26 Nov 2016

Execution switching between host and guest

Author: derek0883@gmail.com

Now I know that I/O accesses from the guest OS are translated into memory accesses on the virtual device object. The CPU is a device too, so accessing a vCPU register is also a memory access. I’m still curious about the execution flow of guest code.

x86 CPU context object

The default x86 CPU context object is CPUX86State, defined in target-i386/cpu.h. More information about each register can be found in x86_cpu_dump_state.

typedef struct CPUX86State {
    /* standard registers */
    target_ulong regs[CPU_NB_REGS];
    target_ulong eip;
    target_ulong eflags; /* eflags register. During CPU emulation, CC
                        flags and DF are set to zero because they are
                        stored elsewhere */

#define R_EAX 0
#define R_ECX 1
#define R_EDX 2
#define R_EBX 3
#define R_ESP 4
#define R_EBP 5
#define R_ESI 6
#define R_EDI 7

Guest code execution

From my previous study, I know QEMU calls cpu_tb_exec to execute guest code.

$ gdb ./x86_64-softmmu/qemu-system-x86_64
(gdb) b cpu_tb_exec
(gdb) r images/cirros-d161007-x86_64-disk.img -monitor stdio -D /tmp/qemu.log -d in_asm,op,out_asm
#0 0x0000555555762921 in cpu_tb_exec at qemu-2.8.0-rc0/cpu-exec.c:164
#1 0x00005555557635c3 in cpu_loop_exec_tb at qemu-2.8.0-rc0/cpu-exec.c:544
#2 0x0000555555763856 in cpu_exec at qemu-2.8.0-rc0/cpu-exec.c:638
#3 0x0000555555792825 in tcg_cpu_exec at qemu-2.8.0-rc0/cpus.c:1117
#4 0x0000555555792a69 in qemu_tcg_cpu_thread_fn at qemu-2.8.0-rc0/cpus.c:1197
#5 0x00007ffff65746fa in start_thread (arg=0x7fffe9318700) at pthread_create.c:333
#6 0x00007ffff62aab5d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109

In GDB, press n until you hit this line:

ret = tcg_qemu_tb_exec(env, tb_ptr);

the definition of tcg_qemu_tb_exec

# define tcg_qemu_tb_exec(env, tb_ptr) \
    ((uintptr_t (*)(void *, void *))tcg_ctx.code_gen_prologue)(env, tb_ptr)

tcg_ctx.code_gen_prologue points to the generated prologue code. In GDB:

(gdb) print &env->eip
$2 = (target_ulong *) 0x5555568370c0
(gdb) print env
$3 = (struct CPUX86State *) 0x555556837040

The offset of EIP within CPUX86State is 0x80 (0x5555568370c0 - 0x555556837040); this value will be used later. There is no source code for the translated code, so switch to the asm layout and use the “si” command until reaching:

0x55555576291f <cpu_tb_exec+254>        callq  *%rcx  
(gdb) info register rcx
rcx            0x7fffe9fff000   140737119252480

The target address is 0x7fffe9fff000. This call transfers control from QEMU's host code to the generated code that runs the guest. In the qemu.log file, 0x7fffe9fff000 is the starting address of the prologue:

PROLOGUE: [size=40]
0x7fffe9fff000:  push   %rbp    0x55
0x7fffe9fff001:  push   %rbx    0x53
0x7fffe9fff002:  push   %r12    0x4154
0x7fffe9fff004:  push   %r13    0x4155
0x7fffe9fff006:  push   %r14    0x4156
0x7fffe9fff008:  push   %r15    0x4157
0x7fffe9fff00a:  mov    %rdi,%r14   0x4c8bf7
0x7fffe9fff00d:  add    $0xfffffffffffffb78,%rsp    0x4881c478fbffff
0x7fffe9fff014:  jmpq   *%rsi   0xffe6
0x7fffe9fff016:  add    $0x488,%rsp 0x4881c488040000
0x7fffe9fff01d:  pop    %r15    0x415f
0x7fffe9fff01f:  pop    %r14    0x415e
0x7fffe9fff021:  pop    %r13    0x415d
0x7fffe9fff023:  pop    %r12    0x415c
0x7fffe9fff025:  pop    %rbx    0x5b
0x7fffe9fff026:  pop    %rbp    0x5d
0x7fffe9fff027:  retq       0xc3

0x00000000fffffff0:  ljmp   $0xf000,$0xe05b

OUT: [size=64]
0x7fffe9fff028:  mov    -0x8(%r14),%ebp 0x418b6ef8
0x7fffe9fff02c:  test   %ebp,%ebp   0x85ed
0x7fffe9fff02e:  jne    0x7fffe9fff05c  0x0f8528000000
0x7fffe9fff034:  movl   $0xf000,0xd0(%r14)  0x41c786d000000000f00000
0x7fffe9fff03f:  movq   $0xf0000,0xd8(%r14) 0x49c786d800000000000f00
0x7fffe9fff04a:  movq   $0xe05b,0x80(%r14)  0x49c786800000005be00000
0x7fffe9fff055:  xor    %eax,%eax   0x33c0
0x7fffe9fff057:  jmpq   0x7fffe9fff016  0xe9baffffff
0x7fffe9fff05c:  lea    0x6549fb0(%rip),%rax        # 0x7ffff0549013    0x488d05b09f5406
0x7fffe9fff063:  jmpq   0x7fffe9fff016  0xe9aeffffff

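From the IBM PC convention, I know the first instruction executed after reset is the BIOS instruction at 0x00000000fffffff0: ljmp $0xf000,$0xe05b. The generated code executes this guest instruction by writing to the CPUX86State pointed to by r14 (the prologue copies env from rdi into r14): it sets the CS selector to 0xf000 (offset 0xd0), the CS base to 0xf0000 (offset 0xd8), and eip to 0xe05b (offset 0x80, the EIP offset computed above).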

The push instructions are generated by tcg_target_qemu_prologue. Before transferring to guest code, QEMU saves the callee-saved registers on the stack; when returning from guest code, it restores those registers.

/* Generate global QEMU prologue and epilogue code */
static void tcg_target_qemu_prologue(TCGContext *s)
    int i, stack_addend;

    /* TB prologue */

    /* Reserve some stack space, also for TCG temps.  */
    stack_addend = FRAME_SIZE - PUSH_SIZE;
                  CPU_TEMP_BUF_NLONGS * sizeof(long));

    /* Save all callee saved registers.  */
    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
        tcg_out_push(s, tcg_target_callee_save_regs[i]);

    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
    /* jmp *tb.  */
    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
		         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
			 + stack_addend);
    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
    /* jmp *tb.  */
    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);

    /* TB epilogue */
    tb_ret_addr = s->code_ptr;

    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);

    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
    tcg_out_opc(s, OPC_RET, 0, 0, 0);

#if !defined(CONFIG_SOFTMMU)
    /* Try to set up a segment register to point to guest_base.  */
    if (guest_base) {

Furthermore, I used a watchpoint to monitor changes to eip, which gave me a lot of information about how guest code is executed.

(gdb) delete 1
(gdb) print &env->eip
$3 = (target_ulong *) 0x555556837290
(gdb) watch *0x555556837290
Hardware watchpoint 3: *0x555556837290
(gdb) layout asm
(gdb) c
(gdb) c