From f1cf7466c762b26a586cc21b4d3463d0211a2745 Mon Sep 17 00:00:00 2001 From: Achilleas Anagnostopoulos Date: Fri, 27 Apr 2018 19:26:08 +0100 Subject: [PATCH 1/2] Revert "kfmt: fix bug where calls to copy() resulted in garbage being copied" This reverts commit 8f04deadc1f1620dbd0aaea07f04a6c7f82f58c0. The actual issue was triggered by a memory corruption due to the fact that the exception handling gate code did not preserve XMM regs. --- src/gopheros/kernel/kfmt/ringbuf.go | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/gopheros/kernel/kfmt/ringbuf.go b/src/gopheros/kernel/kfmt/ringbuf.go index c377dbc..2bd5baa 100644 --- a/src/gopheros/kernel/kfmt/ringbuf.go +++ b/src/gopheros/kernel/kfmt/ringbuf.go @@ -40,9 +40,10 @@ func (rb *ringBuffer) Read(p []byte) (n int, err error) { n = pLen } - for i := 0; i < n; i, rb.rIndex = i+1, rb.rIndex+1 { - p[i] = rb.buffer[rb.rIndex] - } + copy(p, rb.buffer[rb.rIndex:rb.rIndex+n]) + rb.rIndex += n + + return n, nil case rb.rIndex > rb.wIndex: // Read up to min(len(buf) - rIndex, len(p)) bytes n = len(rb.buffer) - rb.rIndex @@ -50,17 +51,15 @@ func (rb *ringBuffer) Read(p []byte) (n int, err error) { n = pLen } - for i := 0; i < n; i, rb.rIndex = i+1, rb.rIndex+1 { - p[i] = rb.buffer[rb.rIndex] - } + copy(p, rb.buffer[rb.rIndex:rb.rIndex+n]) + rb.rIndex += n if rb.rIndex == len(rb.buffer) { rb.rIndex = 0 } + return n, nil default: // rIndex == wIndex - n, err = 0, io.EOF + return 0, io.EOF } - - return n, err } From fd862d0d15f97c9fd854dd279a3aeee851464c17 Mon Sep 17 00:00:00 2001 From: Achilleas Anagnostopoulos Date: Fri, 27 Apr 2018 20:13:57 +0100 Subject: [PATCH 2/2] rt0_64: preserve XMM registers when calling gate handlers Some gate handlers may invoke Go runtime code that clobbers the XMM registers that could be in use by the code that triggered the gate handler. 
One case where this happens is when growing a slice and the new location for the slice data resides in a RO page with a CoW flag. In this scenario, runtime.growslice will invoke runtime.memmove which may take a code path that uses XMM registers for copying data around. During the copy, a page fault occurs and the kernel page fault handler will detect the CoW flag, allocate a new frame and copy the original data to the new frame before resuming execution. As the page copy is performed using the built-in copy function this will cause the XMM registers to be clobbered. To prevent this from happening, the asm gate code that gets executed when an exception occurs will now preserve the XMM regs on the stack before invoking the registered exception handler. In addition, for Go 1.9+ the gate code will also temporarily disable use of AVX instructions by runtime.memmove by setting runtime.useAVXmemmove to 0. This is required so the gate does not need to also preserve any AVX registers. --- Makefile | 6 +- src/arch/x86_64/asm/rt0_64.s | 214 +++++++++++++++++++++++------------ 2 files changed, 148 insertions(+), 72 deletions(-) diff --git a/Makefile b/Makefile index 07cfefb..bce8f30 100644 --- a/Makefile +++ b/Makefile @@ -32,7 +32,8 @@ GOROOT := $(shell $(GO) env GOROOT) GC_FLAGS ?= LD_FLAGS := -n -T $(BUILD_DIR)/linker.ld -static --no-ld-generated-unwind-info AS_FLAGS := -g -f elf64 -F dwarf -I $(BUILD_DIR)/ -I src/arch/$(ARCH)/asm/ \ -dNUM_REDIRECTS=$(shell GOPATH=$(GOPATH) $(GO) run tools/redirects/redirects.go count) + -dNUM_REDIRECTS=$(shell GOPATH=$(GOPATH) $(GO) run tools/redirects/redirects.go count) \ + -dWITH_RUNTIME_AVXMEMMOVE=$(shell grep -r "var useAVXmemmove" $(GOROOT)/src/runtime/ | wc -l) MIN_OBJCOPY_VERSION := 2.26.0 HAVE_VALID_OBJCOPY := $(shell objcopy -V | head -1 | awk -F ' ' '{print "$(MIN_OBJCOPY_VERSION)\n" $$NF}' | sort -ct. -k1,1n -k2,2n && echo "y") @@ -74,13 +75,14 @@ go.o: @# objcopy to make that symbol exportable. 
Since nasm does not support externs @# with slashes we create a global symbol alias for kernel.Kmain @echo "[objcopy] create kernel.Kmain alias to gopheros/kernel/kmain.Kmain" - @echo "[objcopy] globalizing symbols {_rt0_interrupt_handlers, runtime.g0/m0/physPageSize}" + @echo "[objcopy] globalizing symbols {_rt0_interrupt_handlers, runtime.g0/m0/physPageSize/useAVXmemmove}" @objcopy \ --add-symbol kernel.Kmain=.text:0x`nm $(BUILD_DIR)/go.o | grep "kmain.Kmain$$" | cut -d' ' -f1` \ --globalize-symbol _rt0_interrupt_handlers \ --globalize-symbol runtime.g0 \ --globalize-symbol runtime.m0 \ --globalize-symbol runtime.physPageSize \ + --globalize-symbol runtime.useAVXmemmove \ $(BUILD_DIR)/go.o $(BUILD_DIR)/go.o binutils_version_check: diff --git a/src/arch/x86_64/asm/rt0_64.s b/src/arch/x86_64/asm/rt0_64.s index fb42fc9..784331e 100644 --- a/src/arch/x86_64/asm/rt0_64.s +++ b/src/arch/x86_64/asm/rt0_64.s @@ -27,6 +27,14 @@ _rt0_irq_handlers resq IDT_ENTRIES r0_g_ptr: resq 1 tcb_ptr: resq 1 +; Go < 1.9 does not define runtime.useAVXmemmove; to avoid linker errors define +; a dummy symbol so that the gate entry code can work as expected. 
+%if WITH_RUNTIME_AVXMEMMOVE == 0 + runtime.useAVXmemmove resb 1 +%else + extern runtime.useAVXmemmove +%endif + section .text ;------------------------------------------------------------------------------ @@ -182,7 +190,7 @@ _rt0_64_gate_entry_%+ gate_num: %assign gate_num gate_num+1 %endrep -%macro save_regs 0 +%macro save_gp_regs 0 push r15 push r14 push r13 @@ -200,7 +208,7 @@ _rt0_64_gate_entry_%+ gate_num: push rax %endmacro -%macro restore_regs 0 +%macro restore_gp_regs 0 pop rax pop rbx pop rcx @@ -218,97 +226,163 @@ _rt0_64_gate_entry_%+ gate_num: pop r15 %endmacro +%macro save_xmm_regs 0 + sub rsp, 16*16 + movdqu [rsp+0*16], xmm0 + movdqu [rsp+1*16], xmm1 + movdqu [rsp+2*16], xmm2 + movdqu [rsp+3*16], xmm3 + movdqu [rsp+4*16], xmm4 + movdqu [rsp+5*16], xmm5 + movdqu [rsp+6*16], xmm6 + movdqu [rsp+7*16], xmm7 + movdqu [rsp+8*16], xmm8 + movdqu [rsp+9*16], xmm9 + movdqu [rsp+10*16], xmm10 + movdqu [rsp+11*16], xmm11 + movdqu [rsp+12*16], xmm12 + movdqu [rsp+13*16], xmm13 + movdqu [rsp+14*16], xmm14 + movdqu [rsp+15*16], xmm15 +%endmacro + +%macro restore_xmm_regs 0 + movdqu xmm0, [rsp+0*16] + movdqu xmm1, [rsp+1*16] + movdqu xmm2, [rsp+2*16] + movdqu xmm3, [rsp+3*16] + movdqu xmm4, [rsp+4*16] + movdqu xmm5, [rsp+5*16] + movdqu xmm6, [rsp+6*16] + movdqu xmm7, [rsp+7*16] + movdqu xmm8, [rsp+8*16] + movdqu xmm9, [rsp+9*16] + movdqu xmm10, [rsp+10*16] + movdqu xmm11, [rsp+11*16] + movdqu xmm12, [rsp+12*16] + movdqu xmm13, [rsp+13*16] + movdqu xmm14, [rsp+14*16] + movdqu xmm15, [rsp+15*16] + add rsp, 16*16 +%endmacro + + ;------------------------------------------------------------------------------ ; This dispatcher is invoked by gate entries that expect a code to be pushed -; by the CPU to the stack. 
It performs the following functions: -; - save registers -; - push pointer to saved regs -; - push pointer to stack frame -; - read and push exception code -; - invoke handler(code, &frame, ®s) -; - restore registers -; - pop exception code from stack so rsp points to the stack frame +; by the CPU to the stack. +; +; This is the stack layout used by this function. Items are 8-bytes +; wide with the exception of the xmm regs that are 16 bytes wide +; +; ----------------| +; useAVXmemmove | <- original value of runtime.useAVXmemmove +;-----------------| +; xmm regs (16) | +;-----------------| <- RBP will point at the last pushed GP reg +; gp regs (15) | +;-----------------| +; handler address | <- pushed by gate_entry_xxx (RSP initially points here) +;-----------------| +; exception code | <- pushed by CPU (must be popped before returning) +;-----------------| +; RIP | <- pushed by CPU (exception frame) +; CS | +; RFLAGS | +; RSP | +; SS | +;----------------- ;------------------------------------------------------------------------------ _rt0_64_gate_dispatcher_with_code: - ; This is how the stack looks like when entering this function: - ; (each item is 8-bytes wide) - ; - ;------------------ - ; handler address | <- pushed by gate_entry_xxx (RSP points here) - ;-----------------| - ; Exception code | <- needs to be removed from stack before calling iretq - ;-----------------| - ; RIP | <- exception frame - ; CS | - ; RFLAGS | - ; RSP | - ; SS | - ;----------------- cld - ; save regs and push a pointer to them - save_regs - mov rax, rsp ; rax points to saved rax - push rax ; push pointer to saved regs + ; save general-purpose regs + save_gp_regs + mov rbp, rsp ; rbp points to saved rax - ; push pointer to exception stack frame (we have used 15 qwords for the - ; saved registers plus one qword for the data pushed by the gate entry - ; plus one extra qword to jump over the exception code) - add rax, 17*8 - push rax + ; save xmm regs as the fault handler may clobber 
them by calling an + ; SSE-enabled runtime function like copy (calls runtime.memmove). In + ; addition temporarily disable AVX support for runtime.memmove so we + ; don't need to also preserve the avx regs. + save_xmm_regs + mov rax, runtime.useAVXmemmove + push qword [rax] + mov byte [rax], 0 - ; push exception code (located between the stack frame and the saved regs) - sub rax, 8 - push qword [rax] + ; push exception handler args and call registered handler + push qword [rbp] ; ptr to regs + push qword [rbp+17*8] ; ptr to exception frame + push qword [rbp+16*8] ; exception code + call qword [rbp+15*8] + add rsp, 3 * 8 - call [rsp + 18*8] ; call registered irq handler + ; restore xmm regs and restore AVX support for runtime.memmove + mov rax, runtime.useAVXmemmove + pop rbx + mov byte [rax], bl + restore_xmm_regs - add rsp, 3 * 8 ; unshift the pushed arguments so rsp points to the saved regs - restore_regs + ; restore general purpose regs + restore_gp_regs - add rsp, 16 ; pop handler address and exception code off the stack before returning + ; pop handler address + exception code so RSP points to the stack frame. + add rsp, 2*8 iretq ;------------------------------------------------------------------------------ ; This dispatcher is invoked by gate entries that do not use exception codes. -; It performs the following functions: -; - save registers -; - push pointer to saved regs -; - push pointer to stack frame -; - invoke handler(&frame, ®s) -; - restore registers +; +; This is the stack layout used by this function. 
Items are 8-bytes +; wide with the exception of the xmm regs that are 16 bytes wide +; +; ----------------| +; useAVXmemmove | <- original value of runtime.useAVXmemmove +;-----------------| +; xmm regs (16) | +;-----------------| <- RBP will point at the last pushed GP reg +; gp regs (15) | +;-----------------| +; handler address | <- pushed by gate_entry_xxx (RSP initially points here) +;-----------------| +; RIP | <- pushed by CPU (exception frame) +; CS | +; RFLAGS | +; RSP | +; SS | +;----------------- ;------------------------------------------------------------------------------ _rt0_64_gate_dispatcher_without_code: - ; This is how the stack looks like when entering this function: - ; (each item is 8-bytes wide) - ; - ;------------------ - ; handler address | <- pushed by gate_entry_xxx (RSP points here) - ;-----------------| - ; RIP | <- exception frame - ; CS | - ; RFLAGS | - ; RSP | - ; SS | - ;----------------- cld - ; save regs and push a pointer to them - save_regs - mov rax, rsp ; rax points to saved rax - push rax ; push pointer to saved regs + ; save general-purpose regs + save_gp_regs + mov rbp, rsp ; rbp points to saved rax - ; push pointer to exception stack frame (we have used 15 qwords for the - ; saved registers plus one qword for the data pushed by the gate entry) - add rax, 16*8 - push rax + ; save xmm regs as the fault handler may clobber them by calling an + ; SSE-enabled runtime function like copy (calls runtime.memmove). In + ; addition temporarily disable AVX support for runtime.memmove so we + ; don't need to also preserve the avx regs. 
+ save_xmm_regs + mov rax, runtime.useAVXmemmove + push qword [rax] + mov byte [rax], 0 - call [rsp + 17*8] ; call registered irq handler + ; push exception handler args and call registered handler + push qword [rbp] ; ptr to regs + push qword [rbp+16*8] ; ptr to exception frame + call qword [rbp+15*8] + add rsp, 2 * 8 - add rsp, 2 * 8 ; unshift the pushed arguments so rsp points to the saved regs - restore_regs + ; restore xmm regs and restore AVX support for runtime.memmove + mov rax, runtime.useAVXmemmove + pop rbx + mov byte [rax], bl + restore_xmm_regs - add rsp, 8 ; pop handler address off the stack before returning + ; restore general purpose regs + restore_gp_regs + + ; pop handler address so RSP points to the stack frame. + add rsp, 8 iretq ;------------------------------------------------------------------------------