天天看点

kvm 源代码杂篇

分析KVM,对我这种菜鸟确实难度太大。下面简单的先从虚拟机的创建和运行调用的函数分析。。。

首先声明一个kvm_context_t 变量用以描述用户态虚拟机上下文信息,然后调用kvm_init()函数初始化虚拟机上下文信息;函数kvm_create()创建虚拟机实例,该函数通过ioctl系统调用创建虚拟机相关的内核数据结构,并将虚拟机文件描述符返回给用户态的kvm_context_t数据结构。(注意:下面贴出的是内核模块中的同名函数kvm_init(),它负责KVM模块自身的初始化,并非上面所说的用户态kvm_init()。)

<span style="font-size:18px;">2587 int kvm_init(void *opaque, unsigned int vcpu_size,
2588                   struct module *module)
2589 {
2590         int r;
2591         int cpu;
2592 
2593         r = kvm_arch_init(opaque);
2594         if (r)
2595                 goto out_fail;
2596 
2597         bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2598 
2599         if (bad_page == NULL) {
2600                 r = -ENOMEM;
2601                 goto out;
2602         }
2603 
2604         bad_pfn = page_to_pfn(bad_page);
2605 
2606         if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
2607                 r = -ENOMEM;
2608                 goto out_free_0;
2609         }
2610 
2611         r = kvm_arch_hardware_setup();
2612         if (r < 0)
2613                 goto out_free_0a;
2614 
2615         for_each_online_cpu(cpu) {
2616                 smp_call_function_single(cpu,
2617                                 kvm_arch_check_processor_compat,
2618                                 &r, 1);
2619                 if (r < 0)
2620                         goto out_free_1;
2621         }
2622 
2623         on_each_cpu(hardware_enable, NULL, 1);
2624         r = register_cpu_notifier(&kvm_cpu_notifier);
2625         if (r)
2626                 goto out_free_2;
2627         register_reboot_notifier(&kvm_reboot_notifier);
2628 
2629         r = sysdev_class_register(&kvm_sysdev_class);
2630         if (r)
2631                 goto out_free_3;
2632 
2633         r = sysdev_register(&kvm_sysdev);
2634         if (r)
2635                 goto out_free_4;
2636 
2637         /* A kmem cache lets us meet the alignment requirements of fx_save. */
2638         kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
2639                                            __alignof__(struct kvm_vcpu),
2640                                            0, NULL);
2641         if (!kvm_vcpu_cache) {
2642                 r = -ENOMEM;
2643                 goto out_free_5;
2644         }
2645 
2646         kvm_chardev_ops.owner = module;
2647         kvm_vm_fops.owner = module;
2648         kvm_vcpu_fops.owner = module;
2649 
2650         r = misc_register(&kvm_dev);
2651         if (r) {
2652                 printk(KERN_ERR "kvm: misc device register failed\n");
2653                 goto out_free;
2654         }
2655 
2656         kvm_preempt_ops.sched_in = kvm_sched_in;
2657         kvm_preempt_ops.sched_out = kvm_sched_out;
2658 
2659         kvm_init_debug();
2660 
2661         return 0;
2662 
2663 out_free:
2664         kmem_cache_destroy(kvm_vcpu_cache);
2665 out_free_5:
2666         sysdev_unregister(&kvm_sysdev);
2667 out_free_4:
2668         sysdev_class_unregister(&kvm_sysdev_class);
2669 out_free_3:
2670         unregister_reboot_notifier(&kvm_reboot_notifier);
2671         unregister_cpu_notifier(&kvm_cpu_notifier);
2672 out_free_2:
2673         on_each_cpu(hardware_disable, NULL, 1);
2674 out_free_1:
2675         kvm_arch_hardware_unsetup();
2676 out_free_0a:
2677         free_cpumask_var(cpus_hardware_enabled);
2678 out_free_0:
2679         __free_page(bad_page);
2680 out:
2681         kvm_arch_exit();
2682 out_fail:
2683         return r;
2684 }</span>
           

下面稍微详细分析下面流程:

首先,用户态的Qemu代码调用kvm_init函数,kvm_init通过qemu_open("/dev/kvm")检查内核驱动插入情况,通过kvm_ioctl(s, KVM_GET_API_VERSION, 0)获取API接口版本,最后调用kvm_ioctl(s, KVM_CREATE_VM, 0)创建KVM虚拟机,获取虚拟机句柄。

简单点说,就是在用户态调用了kvm_init(),然后用户态的Qemu调用kvm_ioctl(s, KVM_CREATE_VM, 0)来获取KVM虚拟机接口。那我们还必须知道调用了这个函数之后会发生什么,也就是KVM是如何由这个函数展开,然后创建虚拟机的。

内核对应的入口代码在此:

<span style="font-size:18px;">static int kvm_dev_ioctl_create_vm(void)
2271 {
2272         int fd;
2273         struct kvm *kvm;
2274 
2275         kvm = kvm_create_vm();
2276         if (IS_ERR(kvm))
2277                 return PTR_ERR(kvm);
2278         fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, 0);
2279         if (fd < 0)
2280                 kvm_put_kvm(kvm);
2281 
2282         return fd;
2283 }</span>
           

从上面可以看出,是通过kvm_create_vm()来进一步调用。找到kvm_create_vm():

<span style="font-size:18px;">945 static struct kvm *kvm_create_vm(void)
946 {
947         struct kvm *kvm = kvm_arch_create_vm();
948 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
949         struct page *page;
950 #endif
951 
952         if (IS_ERR(kvm))
953                 goto out;
954 #ifdef CONFIG_HAVE_KVM_IRQCHIP
955         INIT_LIST_HEAD(&kvm->irq_routing);
956         INIT_HLIST_HEAD(&kvm->mask_notifier_list);
957 #endif
958 
959 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
960         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
961         if (!page) {
962                 kfree(kvm);
963                 return ERR_PTR(-ENOMEM);
964         }
965         kvm->coalesced_mmio_ring =
966                         (struct kvm_coalesced_mmio_ring *)page_address(page);
967 #endif
968 
969 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
970         {
971                 int err;
972                 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
973                 err = mmu_notifier_register(&kvm->mmu_notifier, current->mm);
974                 if (err) {
975 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
976                         put_page(page);
977 #endif
978                         kfree(kvm);
979                         return ERR_PTR(err);
980                 }
981         }
982 #endif
983 
984         kvm->mm = current->mm;
985         atomic_inc(&kvm->mm->mm_count);
986         spin_lock_init(&kvm->mmu_lock);
987         spin_lock_init(&kvm->requests_lock);
988         kvm_io_bus_init(&kvm->pio_bus);
989         mutex_init(&kvm->lock);
990         kvm_io_bus_init(&kvm->mmio_bus);
991         init_rwsem(&kvm->slots_lock);
992         atomic_set(&kvm->users_count, 1);
993         spin_lock(&kvm_lock);
994         list_add(&kvm->vm_list, &vm_list);
995         spin_unlock(&kvm_lock);
996 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
997         kvm_coalesced_mmio_init(kvm);
998 #endif
999 out:
1000         return kvm;
1001 }</span>
           

这里kvm_arch_create_vm():是用来初始化KVM结构体信息。

总结这个函数吧,kvm_create_vm事实上也就做了初始化和启动硬件特性两件事,然后将相应的句柄返回给用户态。

创建完内核虚拟机数据结构后,再创建内核pit以及mmio等基本外设模拟设备,然后调用kvm_create_vcpu()函数来创建虚拟处理器。

下面看下kvm_create_vcpu()在内核中对应的处理函数kvm_vm_ioctl_create_vcpu(),以及它所调用的kvm_arch_vcpu_setup():

<span style="font-size:18px;">1726 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
1727 {
1728         int r;
1729         struct kvm_vcpu *vcpu;
1730 
1731         if (!valid_vcpu(n))
1732                 return -EINVAL;
1733 
1734         vcpu = kvm_arch_vcpu_create(kvm, n);
1735         if (IS_ERR(vcpu))
1736                 return PTR_ERR(vcpu);
1737 
1738         preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
1739 
1740         r = kvm_arch_vcpu_setup(vcpu);
1741         if (r)
1742                 return r;
1743 
1744         mutex_lock(&kvm->lock);
1745         if (kvm->vcpus[n]) {
1746                 r = -EEXIST;
1747                 goto vcpu_destroy;
1748         }
1749         kvm->vcpus[n] = vcpu;
1750         mutex_unlock(&kvm->lock);
1751 
1752         /* Now it's all set up, let userspace reach it */
1753         kvm_get_kvm(kvm);
1754         r = create_vcpu_fd(vcpu);
1755         if (r < 0)
1756                 goto unlink;
1757         return r;
1758 
1759 unlink:
1760         mutex_lock(&kvm->lock);
1761         kvm->vcpus[n] = NULL;
1762 vcpu_destroy:
1763         mutex_unlock(&kvm->lock);
1764         kvm_arch_vcpu_destroy(vcpu);
1765         return r;
1766 }
1767 </span>
           
<span style="font-size:18px;">4365 
4366 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
4367 {
4368         int r;
4369 
4370         /* We do fxsave: this must be aligned. */
4371         BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
4372 
4373         vcpu->arch.mtrr_state.have_fixed = 1;
4374         vcpu_load(vcpu);
4375         r = kvm_arch_vcpu_reset(vcpu);
4376         if (r == 0)
4377                 r = kvm_mmu_setup(vcpu);
4378         vcpu_put(vcpu);
4379         if (r < 0)
4380                 goto free_vcpu;
4381 
4382         return 0;
4383 free_vcpu:
4384         kvm_x86_ops->vcpu_free(vcpu);
4385         return r;
4386 }</span>
           

继续流程分析:

该函数通过ioctl()系统调用,请求由vm_fd文件描述符指向的虚拟文件创建虚拟处理器,并将虚拟处理器的文件描述符返回给用户态程序,供以后调度时使用。

好,CPU的初始化和创建暂时完成:下面是内存,即影子页表的初始化:

创建完虚拟处理器后,由用户态的QEMU程序申请客户机用户空间,用以加载和运行客户机代码;为了使得客户虚拟机正确执行,必须要在内核中为客户机建立正确的内存映射关系,即影子页表信息。因此,申请客户机内存地址空间后,调用函数kvm_create_phys_mem()创建客户机内存映射关系,该函数主要通过ioctl系统调用向vm_fd指向的虚拟文件调用设置内核数据结构中客户机内存域相关信息,主要建立影子页表信息;当创建好虚拟处理器和影子页表后,即可读取客户机到指定分配的空间中,然后调度虚拟处理器运行。

kvm_create_phys_mem():代码在此(注:下面贴出的代码实际上仍是前文的kvm_create_vm(),并不是kvm_create_phys_mem()的实现,此处应为误贴)

945 static struct kvm *kvm_create_vm(void)
946 {
947         struct kvm *kvm = kvm_arch_create_vm();
948 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
949         struct page *page;
950 #endif
951 
952         if (IS_ERR(kvm))
953                 goto out;
954 #ifdef CONFIG_HAVE_KVM_IRQCHIP
955         INIT_LIST_HEAD(&kvm->irq_routing);
956         INIT_HLIST_HEAD(&kvm->mask_notifier_list);
957 #endif
958 
959 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
960         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
961         if (!page) {
962                 kfree(kvm);
963                 return ERR_PTR(-ENOMEM);
964         }
965         kvm->coalesced_mmio_ring =
966                         (struct kvm_coalesced_mmio_ring *)page_address(page);
967 #endif
968 
969 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
970         {
971                 int err;
972                 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
973                 err = mmu_notifier_register(&kvm->mmu_notifier, current->mm);
974                 if (err) {
975 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
976                         put_page(page);
977 #endif
978                         kfree(kvm);
979                         return ERR_PTR(err);
980                 }
981         }
982 #endif
983 
984         kvm->mm = current->mm;
985         atomic_inc(&kvm->mm->mm_count);
986         spin_lock_init(&kvm->mmu_lock);
987         spin_lock_init(&kvm->requests_lock);
988         kvm_io_bus_init(&kvm->pio_bus);
989         mutex_init(&kvm->lock);
990         kvm_io_bus_init(&kvm->mmio_bus);
991         init_rwsem(&kvm->slots_lock);
992         atomic_set(&kvm->users_count, 1);
993         spin_lock(&kvm_lock);
994         list_add(&kvm->vm_list, &vm_list);
995         spin_unlock(&kvm_lock);
996 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
997         kvm_coalesced_mmio_init(kvm);
998 #endif
999 out:
1000         return kvm;
1001 }
1002 
           

内存创建之后,即可以运行虚拟机了。

调度虚拟机的函数为kvm_run(),其在内核中对应的处理函数是kvm_arch_vcpu_ioctl_run(),代码如下:

3466 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3467 {
3468         int r;
3469         sigset_t sigsaved;
3470 
3471         vcpu_load(vcpu);
3472 
3473         if (vcpu->sigset_active)
3474                 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
3475 
3476         if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
3477                 kvm_vcpu_block(vcpu);
3478                 clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
3479                 r = -EAGAIN;
3480                 goto out;
3481         }
3482 
3483         /* re-sync apic's tpr */
3484         if (!irqchip_in_kernel(vcpu->kvm))
3485                 kvm_set_cr8(vcpu, kvm_run->cr8);
3486 
3487         if (vcpu->arch.pio.cur_count) {
3488                 r = complete_pio(vcpu);
3489                 if (r)
3490                         goto out;
3491         }
3492 #if CONFIG_HAS_IOMEM
3493         if (vcpu->mmio_needed) {
3494                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
3495                 vcpu->mmio_read_completed = 1;
3496                 vcpu->mmio_needed = 0;
3497 
3498                 down_read(&vcpu->kvm->slots_lock);
3499                 r = emulate_instruction(vcpu, kvm_run,
3500                                         vcpu->arch.mmio_fault_cr2, 0,
3501                                         EMULTYPE_NO_DECODE);
3502                 up_read(&vcpu->kvm->slots_lock);
3503                 if (r == EMULATE_DO_MMIO) {
3504                         /*
3505                          * Read-modify-write.  Back to userspace.
3506                          */
3507                         r = 0;
3508                         goto out;
3509                 }
3510         }
3511 #endif
3512         if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
3513                 kvm_register_write(vcpu, VCPU_REGS_RAX,
3514                                      kvm_run->hypercall.ret);
3515 
3516         r = __vcpu_run(vcpu, kvm_run);
3517 
3518 out:
3519         if (vcpu->sigset_active)
3520                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
3521 
3522         vcpu_put(vcpu);
3523         return r;
3524 }
           

kvm_run()函数通过ioctl系统调用,调用虚拟处理器文件描述符所指向的虚拟文件的调度处理函数(即上面的kvm_arch_vcpu_ioctl_run()),来调度虚拟处理器的执行;该系统调用将虚拟处理器vcpu信息加载到物理处理器中,通过vm_entry进入客户机执行。

后面就是陷入和捕获以及上下文切换了,后面分析。。。

在客户机正常运行期间kvm_run()函数不返回,只有发生以下两种情况时,函数才返回:1、发生了I/O事件,如客户机发出读写I/O的指令;2、产生了客户机和内核KVM都无法处理的异常。I/O事件处理完毕后,通过重新调用kvm_run()函数继续调度客户机的执行。

大致流程就是如此,还得继续细细分析。