When retpolines are enabled they have high overhead in the inner loop
inside kvm_handle_hva_range() that iterates over the provided memory area.
Let's mark this function and its TDP MMU equivalent __always_inline so
the compiler will be able to change the call to the actual handler function
inside each of them into a direct one.
This significantly improves performance on the unmap test on the existing
kernel memslot code (tested on a Xeon 8167M machine):
30 slots in use:
Test       Before   After     Improvement
Unmap      0.0353s  0.0334s   5%
Unmap 2M   0.00104s 0.000407s 61%
509 slots in use:
Test       Before   After     Improvement
Unmap      0.0742s  0.0740s   None
Unmap 2M   0.00221s 0.00159s  28%
Looks like having an indirect call in these functions (and, so, a
retpoline) might have interfered with unrolling of the whole loop in the
CPU.
Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
Message-Id: <732d3fe9eb68aa08402a638ab0309199fa89ae56.1612810129.git.maciej.szmigiero@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 
             slot_rmap_walk_okay(_iter_);                               \
             slot_rmap_walk_next(_iter_))
 
-static int kvm_handle_hva_range(struct kvm *kvm,
-                               unsigned long start,
-                               unsigned long end,
-                               unsigned long data,
-                               int (*handler)(struct kvm *kvm,
-                                              struct kvm_rmap_head *rmap_head,
-                                              struct kvm_memory_slot *slot,
-                                              gfn_t gfn,
-                                              int level,
-                                              unsigned long data))
+static __always_inline int
+kvm_handle_hva_range(struct kvm *kvm,
+                    unsigned long start,
+                    unsigned long end,
+                    unsigned long data,
+                    int (*handler)(struct kvm *kvm,
+                                   struct kvm_rmap_head *rmap_head,
+                                   struct kvm_memory_slot *slot,
+                                   gfn_t gfn,
+                                   int level,
+                                   unsigned long data))
 {
        struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
 
        return ret;
 }
 
-static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start,
-               unsigned long end, unsigned long data,
-               int (*handler)(struct kvm *kvm, struct kvm_memory_slot *slot,
-                              struct kvm_mmu_page *root, gfn_t start,
-                              gfn_t end, unsigned long data))
+static __always_inline int
+kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
+                            unsigned long start,
+                            unsigned long end,
+                            unsigned long data,
+                            int (*handler)(struct kvm *kvm,
+                                           struct kvm_memory_slot *slot,
+                                           struct kvm_mmu_page *root,
+                                           gfn_t start,
+                                           gfn_t end,
+                                           unsigned long data))
 {
        struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;