Reject KVM_RUN if emulation is required (because VMX is running without
unrestricted guest) and an exception is pending, as KVM doesn't support
emulating exceptions except when emulating real mode via vm86.  The vCPU
is hosed either way, but letting KVM_RUN proceed triggers a WARN due to
the impossible condition.  Alternatively, the WARN could be removed, but
then userspace and/or KVM bugs would result in the vCPU silently running
in a bad state, which isn't very friendly to users.
Originally, the bug was hit by syzkaller with a nested guest as that
doesn't require kvm_intel.unrestricted_guest=0.  That particular flavor
is likely fixed by commit 
cd0e615c49e5 ("KVM: nVMX: Synthesize
TRIPLE_FAULT for L2 if emulation is required"), but it's trivial to
trigger the WARN with a non-nested guest, and userspace can likely force
bad state via ioctls() for a nested guest as well.
Checking for the impossible condition needs to be deferred until KVM_RUN
because KVM can't force specific ordering between ioctls.  E.g. clearing
exception.pending in KVM_SET_SREGS doesn't prevent userspace from setting
it in KVM_SET_VCPU_EVENTS, and disallowing KVM_SET_VCPU_EVENTS with
emulation_required would prevent userspace from queuing an exception and
then stuffing sregs.  Note, if KVM were to try and detect/prevent the
condition prior to KVM_RUN, handle_invalid_guest_state() and/or
handle_emulation_failure() would need to be modified to clear the pending
exception prior to exiting to userspace.
 ------------[ cut here ]------------
 WARNING: CPU: 6 PID: 137812 at arch/x86/kvm/vmx/vmx.c:1623 vmx_queue_exception+0x14f/0x160 [kvm_intel]
 CPU: 6 PID: 137812 Comm: vmx_invalid_nes Not tainted 5.15.2-
7cc36c3e14ae-pop #279
 Hardware name: ASUS Q87M-E/Q87M-E, BIOS 1102 03/03/2014
 RIP: 0010:vmx_queue_exception+0x14f/0x160 [kvm_intel]
 Code: <0f> 0b e9 fd fe ff ff 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00
 RSP: 0018:
ffffa45c83577d38 EFLAGS: 
00010202
 RAX: 
0000000000000003 RBX: 
0000000080000006 RCX: 
0000000000000006
 RDX: 
0000000000000000 RSI: 
0000000000010002 RDI: 
ffff9916af734000
 RBP: 
ffff9916af734000 R08: 
0000000000000000 R09: 
0000000000000000
 R10: 
0000000000000000 R11: 
0000000000000001 R12: 
0000000000000006
 R13: 
0000000000000000 R14: 
ffff9916af734038 R15: 
0000000000000000
 FS:  
00007f1e1a47c740(0000) GS:
ffff99188fb80000(0000) knlGS:
0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 
0000000080050033
 CR2: 
00007f1e1a6a8008 CR3: 
000000026f83b005 CR4: 
00000000001726e0
 Call Trace:
  kvm_arch_vcpu_ioctl_run+0x13a2/0x1f20 [kvm]
  kvm_vcpu_ioctl+0x279/0x690 [kvm]
  __x64_sys_ioctl+0x83/0xb0
  do_syscall_64+0x3b/0xc0
  entry_SYSCALL_64_after_hwframe+0x44/0xae
Reported-by: syzbot+82112403ace4cbd780d8@syzkaller.appspotmail.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <
20211228232437.
1875318-2-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 KVM_X86_OP_NULL(tlb_remote_flush_with_range)
 KVM_X86_OP(tlb_flush_gva)
 KVM_X86_OP(tlb_flush_guest)
+KVM_X86_OP(vcpu_pre_run)
 KVM_X86_OP(run)
 KVM_X86_OP_NULL(handle_exit)
 KVM_X86_OP_NULL(skip_emulated_instruction)
 
         */
        void (*tlb_flush_guest)(struct kvm_vcpu *vcpu);
 
+       int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
        enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu);
        int (*handle_exit)(struct kvm_vcpu *vcpu,
                enum exit_fastpath_completion exit_fastpath);
 
        svm_complete_interrupts(vcpu);
 }
 
+static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+       return 1;
+}
+
 static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
 {
        if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
        .tlb_flush_gva = svm_flush_tlb_gva,
        .tlb_flush_guest = svm_flush_tlb,
 
+       .vcpu_pre_run = svm_vcpu_pre_run,
        .run = svm_vcpu_run,
        .handle_exit = handle_exit,
        .skip_emulated_instruction = skip_emulated_instruction,
 
        return 1;
 }
 
+static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       return vmx->emulation_required && !vmx->rmode.vm86_active &&
+              vcpu->arch.exception.pending;
+}
+
 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
                if (!kvm_emulate_instruction(vcpu, 0))
                        return 0;
 
-               if (vmx->emulation_required && !vmx->rmode.vm86_active &&
-                   vcpu->arch.exception.pending) {
+               if (vmx_emulation_required_with_pending_exception(vcpu)) {
                        kvm_prepare_emulation_failure_exit(vcpu);
                        return 0;
                }
        return 1;
 }
 
+static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+       if (vmx_emulation_required_with_pending_exception(vcpu)) {
+               kvm_prepare_emulation_failure_exit(vcpu);
+               return 0;
+       }
+
+       return 1;
+}
+
 static void grow_ple_window(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        .tlb_flush_gva = vmx_flush_tlb_gva,
        .tlb_flush_guest = vmx_flush_tlb_guest,
 
+       .vcpu_pre_run = vmx_vcpu_pre_run,
        .run = vmx_vcpu_run,
        .handle_exit = vmx_handle_exit,
        .skip_emulated_instruction = vmx_skip_emulated_instruction,
 
        } else
                WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
 
-       if (kvm_run->immediate_exit)
+       if (kvm_run->immediate_exit) {
                r = -EINTR;
-       else
-               r = vcpu_run(vcpu);
+               goto out;
+       }
+
+       r = static_call(kvm_x86_vcpu_pre_run)(vcpu);
+       if (r <= 0)
+               goto out;
+
+       r = vcpu_run(vcpu);
 
 out:
        kvm_put_guest_fpu(vcpu);