The customer environment was rebooting for no apparent reason, and the IPMI logs showed nothing abnormal.
The call stack at the time of the crash:
crash> bt
PID: 24715 TASK: ffff883f3bedef20 CPU: 17 COMMAND: "CPU 3/KVM"
#0 [ffff883e962d38a0] machine_kexec at ffffffff8105c52b
#1 [ffff883e962d3900] __crash_kexec at ffffffff81104a42
#2 [ffff883e962d39d0] crash_kexec at ffffffff81104b30
#3 [ffff883e962d39e8] oops_end at ffffffff816ad378
#4 [ffff883e962d3a10] die at ffffffff8102e97b
#5 [ffff883e962d3a40] do_trap at ffffffff816acac0
#6 [ffff883e962d3a90] do_invalid_op at ffffffff8102b184
#7 [ffff883e962d3b40] invalid_op at ffffffff816b695e
[exception RIP: kvm_zap_rmapp+87]
RIP: ffffffffc05c2457 RSP: ffff883e962d3bf8 RFLAGS: 00010246
RAX: 0000000000000000 RBX: ffffc90065fc21a0 RCX: 0000000000236234
RDX: ffff883ffe9751a0 RSI: ffff883ffe9751a0 RDI: ffff883f0f3b0000
RBP: ffff883e962d3c08 R8: 0000000000000001 R9: 0000000000000000
R10: ffffc90032c03000 R11: 0000000000440000 R12: ffff883f0f3b0000
R13: ffffffffc05c2460 R14: 0000000000000000 R15: ffffc90032c03008
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#8 [ffff883e962d3bf0] kvm_zap_rmapp at ffffffffc05c2441 [kvm]
#9 [ffff883e962d3c10] kvm_unmap_rmapp at ffffffffc05c246e [kvm]
#10 [ffff883e962d3c20] kvm_handle_hva_range at ffffffffc05bfa14 [kvm]
#11 [ffff883e962d3cc0] kvm_unmap_hva_range at ffffffffc05cba87 [kvm]
#12 [ffff883e962d3cd0] kvm_mmu_notifier_invalidate_range_start at ffffffffc05a2aa3 [kvm]
#13 [ffff883e962d3d10] __mmu_notifier_invalidate_range_start at ffffffff811d5d84
#14 [ffff883e962d3d50] change_protection_range at ffffffff811b9e14
#15 [ffff883e962d3e50] change_protection at ffffffff811b9ec5
#16 [ffff883e962d3e88] change_prot_numa at ffffffff811d291b
#17 [ffff883e962d3e98] task_numa_work at ffffffff810c9292
#18 [ffff883e962d3ef0] task_work_run at ffffffff810ad257
#19 [ffff883e962d3f30] do_notify_resume at ffffffff8102ab62
#20 [ffff883e962d3f50] int_signal at ffffffff816b537d
RIP: 00007f26ed470107 RSP: 00007f26cacf2418 RFLAGS: 00000246
RAX: 0000000000000000 RBX: 000055ee21388000 RCX: ffffffffffffffff
RDX: 0000000000000000 RSI: 000000000000ae80 RDI: 0000000000000033
RBP: 000055ee213880a3 R8: 000055ee1e0b0a30 R9: 000055ee2139e000
R10: 0000000000000000 R11: 0000000000000246 R12: 000055ee1e097680
R13: 00007f26cacf59c0 R14: 00007f26f365d000 R15: 000055ee21388000
ORIG_RAX: 0000000000000010 CS: 0033 SS: 002b
From this call trace we can see that a kernel BUG_ON was triggered. To find where the offending parameters came from, follow the call path below:
change_protection_range
  --> __mmu_notifier_invalidate_range_start
    --> kvm_mmu_notifier_invalidate_range_start
        (mn->ops->invalidate_range_start(mn, mm, start, end);)
void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
unsigned long start, unsigned long end)
{
struct mmu_notifier_mm_rhel7 *mmu_notifier_mm;
struct mmu_notifier_rhel7 *mn_rhel7;
struct mmu_notifier *mn;
...
hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->invalidate_range_start)
		mn->ops->invalidate_range_start(mn, mm, start, end); // the call into KVM (frame #12)
}
...
srcu_read_unlock(&srcu, id);
}
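For context, the KVM side of this hook recovers its struct kvm from the notifier pointer with container_of. Abridged from the 3.10-era virt/kvm/kvm_main.c (the exact RHEL backport may differ in detail):

static struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
	return container_of(mn, struct kvm, mmu_notifier);
}

static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
						    struct mm_struct *mm,
						    unsigned long start,
						    unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	...
	/* frame #11 in the backtrace */
	need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
	...
}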
To trace where the arguments of kvm_mmu_notifier_invalidate_range_start come from, we have to start at change_protection_range. Disassemble it:
/usr/src/debug/kernel-3.10.0-693.11.1.el7/linux-3.10.0-693.11.1.el7.es.10.x86_64/include/linux/mmu_notifier.h: 296
0xffffffff811b98d6 <change_protection_range+598>: mov -0x88(%rbp),%rax
0xffffffff811b98dd <change_protection_range+605>: cmpq $0x0,0x340(%rax)
0xffffffff811b98e5 <change_protection_range+613>: jne 0xffffffff811b9e05 <change_protection_range+1925>
/usr/src/debug/kernel-3.10.0-693.11.1.el7/linux-3.10.0-693.11.1.el7.es.10.x86_64/include/linux/mmu_notifier.h: 297
0xffffffff811b9e05 <change_protection_range+1925>: mov -0x38(%rbp),%rdx <=====end
0xffffffff811b9e09 <change_protection_range+1929>: mov %r14,%rsi <=====start
0xffffffff811b9e0c <change_protection_range+1932>: mov %rax,%rdi <=====mm
0xffffffff811b9e0f <change_protection_range+1935>: callq 0xffffffff811d5d20 <__mmu_notifier_invalidate_range_start> //<=====(1)
At (1) the code calls __mmu_notifier_invalidate_range_start; the instructions just before it load the arguments (per the SysV x86-64 calling convention: %rdi = mm, %rsi = start, %rdx = end). The matching stack frame is:
#14 [ffff883e962d3d50] change_protection_range at ffffffff811b9e14
ffff883e962d3d58: 00007f24b8400000 ffff883ffc9a8c80
ffff883e962d3d68: 0000000000000000 00007f24c83fffff
ffff883e962d3d78: 0000000000000000 00007f24c8400000
ffff883e962d3d88: ffff883ea63087f0 0000000000000000
ffff883e962d3d98: 00007f24c83fffff 8000000000000025
ffff883e962d3da8: 00003ffffffff000 ffff880723f93280
ffff883e962d3db8: ffff883e962d3dc8 ffff883ffc9a8c80
-0x88(rbp)
ffff883e962d3dc8: ffff883f102e4490 ffffffffc05ba723
ffff883e962d3dd8: 0000000000000000 0000000000000001
ffff883e962d3de8: 00007f24c8400000 00007f24bfffffff
ffff883e962d3df8: 0000000000000000 0000000100000000
ffff883e962d3e08: ffff880445bca948 00007f24c0000000
-0x38(rbp)
ffff883e962d3e18: ffffffffc05a10a0 ffff880445bca948
ffff883e962d3e28: ffff883ffc9a8c80 00007f24b8400000
ffff883e962d3e38: 00007f24c8400000 ffff883f3bedef20
ffff883e962d3e48: ffff883e962d3e80 ffffffff811b9ec5
rbp rip
From the stack we can recover the following arguments:
mm    ffff883ffc9a8c80
start unknown (%r14 was reused afterwards, so its value at call time is lost)
end   00007f24c0000000
Per the code:
mmu_notifier_mm = container_of(mm->mmu_notifier_mm,
struct mmu_notifier_mm_rhel7,
b);
crash> mm_struct.mmu_notifier_mm ffff883ffc9a8c80
mmu_notifier_mm = 0xffff883ffcd5ee20
crash> mmu_notifier_mm_rhel7 0xffff883ffcd5ee20
struct mmu_notifier_mm_rhel7 {
b = {
list = {
first = 0xffff883f0f3b3368
},
....
}
crash> mmu_notifier 0xffff883f0f3b3368
struct mmu_notifier {
hlist = {
next = 0x0,
pprev = 0xffff883ffcd5ee20
},
ops = 0xffffffffc05e5ca0
}
crash> struct mmu_notifier_ops 0xffffffffc05e5ca0
struct mmu_notifier_ops {
release = 0xffffffffc059ea00,
clear_flush_young = 0xffffffffc05a2b60,
test_young = 0xffffffffc059e9a0,
change_pte = 0xffffffffc059e920,
invalidate_page = 0xffffffffc05a2ae0,
invalidate_range_start = 0xffffffffc05a2a50,
invalidate_range_end = 0xffffffffc059f690
}
Note that invalidate_range_start = 0xffffffffc05a2a50 is the entry of the function containing frame #12's address ffffffffc05a2aa3, confirming these are KVM's notifier ops. Next, find the offset of mmu_notifier inside struct kvm to locate the enclosing kvm:
crash> struct kvm -ox | grep mmu_notifier
[0x3368] struct mmu_notifier mmu_notifier;
[0x3380] unsigned long mmu_notifier_seq;
[0x3388] long mmu_notifier_count;
crash> px 0xffff883f0f3b3368-0x3368
$3 = 0xffff883f0f3b0000
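container_of is plain pointer arithmetic: subtract the member's offset within the containing type from the member's address, which is exactly the 0xffff883f0f3b3368 - 0x3368 computed above. A minimal stand-alone illustration (the struct layout here is hypothetical):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct notifier { void *ops; };
struct outer {
	char other_fields[0x40];
	struct notifier mn;    /* embedded member, like kvm.mmu_notifier */
};

int main(void)
{
	struct outer o;
	struct notifier *p = &o.mn;
	/* Recover the containing struct from the embedded member's address. */
	struct outer *recovered = container_of(p, struct outer, mn);
	printf("recovered == &o: %d\n", recovered == &o);   /* prints 1 */
	return 0;
}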
So the final arguments are:
kvm          ffff883f0f3b0000
mmu_notifier ffff883f0f3b3368
mm           ffff883ffc9a8c80
start        unknown
end          00007f24c0000000
Finally, we can see the memslots this VM is using:
crash> kvm.memslots 0xffff883f0f3b0000
memslots = {0xffffc90032c03000, 0xffffc90032c10000}
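Two pointers are printed because kernels of this vintage keep one kvm_memslots set per guest address space. A rough sketch of the relevant struct kvm fragment (assumed from upstream, not verified against this exact backport):

struct kvm_memslots;              /* opaque in this sketch */
#define KVM_ADDRESS_SPACE_NUM 2   /* index 0: normal guest memory, index 1: SMM */

struct kvm_fragment_sketch {
	/* ... */
	struct kvm_memslots *memslots[KVM_ADDRESS_SPACE_NUM];
	/* ... */
};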
The crash sits in this loop in kvm_zap_rmapp (arch/x86/kvm/mmu.c, line 1403; loop body completed from the upstream mmu.c of this era):
while ((sptep = rmap_get_first(rmap_head, &iter))) {
	BUG_ON(!(*sptep & PT_PRESENT_MASK));   /* mmu.c:1403 -- the assertion that fired */
	rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
	drop_spte(kvm, sptep);
	flush = true;
}
sptep comes from rmap_get_first, so we need to determine the rmap_head argument. Working from the disassembly and the stack:
...
0xffffffffc05bf9f9 <kvm_handle_hva_range+281>: nopl 0x0(%rax)
/usr/src/debug/kernel-3.10.0-693.11.1.el7/linux-3.10.0-693.11.1.el7.es.10.x86_64/arch/x86/kvm/mmu.c: 1570
0xffffffffc05bfa00 <kvm_handle_hva_range+288>: mov %r14,%r9
0xffffffffc05bfa03 <kvm_handle_hva_range+291>: mov -0x40(%rbp),%r8d
0xffffffffc05bfa07 <kvm_handle_hva_range+295>: mov -0x50(%rbp),%rcx
0xffffffffc05bfa0b <kvm_handle_hva_range+299>: mov %r15,%rdx
0xffffffffc05bfa0e <kvm_handle_hva_range+302>: mov %r12,%rdi
0xffffffffc05bfa11 <kvm_handle_hva_range+305>: callq *%r13 (kvm_unmap_rmapp)
....
Stack contents:
#9 [ffff883e962d3c10] kvm_unmap_rmapp at ffffffffc05c246e [kvm]
ffff883e962d3c18: ffff883e962d3cb8 kvm_handle_hva_range+308
#10 [ffff883e962d3c20] kvm_handle_hva_range at ffffffffc05bfa14 [kvm]
ffff883e962d3c28: 000000000000000a 00007f24c0000000
ffff883e962d3c38: 00007f24b8400000 ffffc90032c0e008
ffff883e962d3c48: ffffc90032c03008 0000000000230000
ffff883e962d3c58: 0000000000237bff 0000000300000001
ffff883e962d3c68: 0000000000236234 ffffc90065fc21a0
0x50 0x48
ffff883e962d3c78: 0000000000000001 ffffc90065fceff8
0x40
ffff883e962d3c88: 00000000c7d21833 ffff883f0f3b0000
ffff883e962d3c98: ffff883f0f3b3368 ffff883f0f3b0048
ffff883e962d3ca8: 0000000000000001 00007f24b8400000
ffff883e962d3cb8: ffff883e962d3cc8 kvm_unmap_hva_range+23
#11 [ffff883e962d3cc0] kvm_unmap_hva_range at ffffffffc05cba87 [kvm]
From this we can infer the handler's arguments:
rmap_head 0xffffc90065fc21a0
kvm       0xffff883f0f3b0000
slot      0xffffc90032c03008 // memslots = {0xffffc90032c03000, 0xffffc90032c10000}
gfn       0x0000000000236234 // from -0x50(%rbp), matching RCX in the exception frame
level     0x0000000000000001
data      0x0000000000000000
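These map onto the per-rmap handler signature that kvm_handle_hva_range dispatches through in upstream kernels of this era (assumed to match the backport, given the register usage above): %rdi = kvm, %rsi = rmap_head, %rdx = slot, %rcx = gfn, %r8d = level, %r9 = data per the SysV x86-64 ABI.

struct kvm;
struct kvm_rmap_head;
struct kvm_memory_slot;
typedef unsigned long long gfn_t;   /* u64 in the kernel */

/* Handler invoked via "callq *%r13" above; here %r13 = kvm_unmap_rmapp. */
typedef int (*rmap_handler_t)(struct kvm *kvm,
			      struct kvm_rmap_head *rmap_head,
			      struct kvm_memory_slot *slot,
			      gfn_t gfn, int level,
			      unsigned long data);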
static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
struct rmap_iterator *iter)
{
if (!rmap_head->val)
return NULL;
if (!(rmap_head->val & 1)) { //<========(1)
iter->desc = NULL;
return (u64 *)rmap_head->val;
}
iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
iter->pos = 0;
return iter->desc->sptes[iter->pos];
}
Read rmap_head->val:
crash> px ((struct kvm_rmap_head*)0xffffc90065fc21a0)->val
$17 = 0xffff883ffe9751a0
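rmap_get_first treats val as a tagged pointer: bit 0 set means (val & ~1ul) points to a pte_list_desc holding several sptes, bit 0 clear means val is itself the lone spte's address. A stand-alone check of the recovered value:

#include <stdio.h>

int main(void)
{
	/* rmap_head->val as read from the dump. */
	unsigned long val = 0xffff883ffe9751a0ul;

	if (val & 1)
		printf("pte_list_desc at %#lx\n", val & ~1ul);
	else
		printf("single spte at %#lx -- branch (1)\n", val);
	return 0;
}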
The low bit of val is clear, so execution takes branch (1): sptep = rmap_get_first(rmap_head, &iter) = 0xffff883ffe9751a0. Now read *sptep:
crash> px *((struct kvm_rmap_head*)0xffffc90065fc21a0)->val
$18 = 0x5c303078
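A quick sanity check of the assertion that fired, assuming PT_PRESENT_MASK is bit 0 of a shadow PTE (as defined in mmu.c of this era):

#include <assert.h>

#define PT_PRESENT_MASK (1ULL << 0)   /* present bit of an x86 shadow PTE */

int main(void)
{
	unsigned long long spte = 0x5c303078ULL;   /* *sptep from the dump */
	/* 0x78 is even, so the present bit is clear and
	 * BUG_ON(!(*sptep & PT_PRESENT_MASK)) fires. */
	assert((spte & PT_PRESENT_MASK) == 0);
	return 0;
}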
So *sptep = 0x5c303078 has its present bit clear, BUG_ON(!(*sptep & PT_PRESENT_MASK)) evaluates true, and the kernel BUG fires. But why is the spte garbage?
Examine the memory around it:
crash> rd 0xffff883ffe9751a0 600 | grep -v "0000000000000000 0000000000000000"
ffff883ffe9751a0: 5c3030785c303078 206d766b66303078 x00\x00\x00fkvm
ffff883ffe9751b0: 3a5d31363236325b 7520307570637620 [26261]: vcpu0 u
ffff883ffe9751c0: 64656c646e61686e 203a72736d647220 nhandled rdmsr:
ffff883ffe9751d0: 30785c3261317830 30785c3030785c30 0x1a2\x00\x00\x0
ffff883ffe9751e0: 61785c6264785c30 31785c473f7c5334 0\xdb\xa4S|?G\x1
ffff883ffe9751f0: 785c3c3030785c32 5c3030785c293030 2\x00<\x00)\x00\
ffff883ffe975200: 5c3030785c303078 206d766b66303078 x00\x00\x00fkvm
ffff883ffe975210: 3a5d31363236325b 7520307570637620 [26261]: vcpu0 u
ffff883ffe975220: 64656c646e61686e 203a72736d647220 nhandled rdmsr:
ffff883ffe975230: 30785c3261317830 30785c3030785c30 0x1a2\x00\x00\x0
ffff883ffe975240: 785c3861785c2c30 473f3762785c6565 0,\xa8\xee\xb7?G
ffff883ffe975250: 3030785c3231785c 785c293030785c3c \x12\x00<\x00)\x
ffff883ffe975260: 785c3030785c3030 6b663030785c3030 00\x00\x00\x00fk
ffff883ffe975270: 363236325b206d76 75706376203a5d31 vm [26261]: vcpu
ffff883ffe975280: 646e61686e752030 736d64722064656c 0 unhandled rdms
The region is essentially full of the "[26261]: vcpu0 unhandled rdmsr" messages our guest keeps triggering, and /var/log/messages was indeed continuously flooded with this line. Red Hat's records attribute this kind of memory corruption to the introduction of the PML (Page Modification Logging) feature.
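In fact the corrupted *sptep value itself decodes as a fragment of that log text (x86 stores it little-endian):

#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned long long spte = 0x5c303078ULL;   /* the corrupted *sptep */
	char buf[9] = {0};

	memcpy(buf, &spte, sizeof(spte));
	printf("%s\n", buf);   /* prints: x00\  -- a piece of a "\x00" escape in the log */
	return 0;
}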
Check whether PML is enabled on this system:
crash> sym -l kvm_intel | grep enable_pml
ffffffffc181490c (d) enable_pml
crash> x/b 0xffffffffc181490c
0xffffffffc181490c <enable_pml>: 0x01
However, https://access.redhat.com/solutions/2924271 states this was fixed in RHEL 7.3, and our kernel is RHEL 7.4 based; we later found that Red Hat has also seen the issue reproduce on RHEL 7.4:
https://bugzilla.redhat.com/show_bug.cgi?id=1726486
Solutions:
- Upgrade to a newer kernel.
- As a workaround, disable the PML feature:
echo "options kvm_intel pml=0" >> /etc/modprobe.d/kvm_intel.conf
modprobe -r kvm_intel && modprobe kvm_intel   # reload the module (or simply reboot)
cat /sys/module/kvm_intel/parameters/pml      # verify PML is now off
Because this code path is driven by automatic NUMA balancing, Red Hat also offers disabling NUMA balancing as a mitigation:
echo 0 > /proc/sys/kernel/numa_balancing   # disable automatic NUMA balancing
References:
https://bugzilla.redhat.com/show_bug.cgi?id=1726486
https://access.redhat.com/solutions/2924271