diff --git a/Makefile b/Makefile index 07cfefb..bce8f30 100644 --- a/Makefile +++ b/Makefile @@ -32,7 +32,8 @@ GOROOT := $(shell $(GO) env GOROOT) GC_FLAGS ?= LD_FLAGS := -n -T $(BUILD_DIR)/linker.ld -static --no-ld-generated-unwind-info AS_FLAGS := -g -f elf64 -F dwarf -I $(BUILD_DIR)/ -I src/arch/$(ARCH)/asm/ \ - -dNUM_REDIRECTS=$(shell GOPATH=$(GOPATH) $(GO) run tools/redirects/redirects.go count) + -dNUM_REDIRECTS=$(shell GOPATH=$(GOPATH) $(GO) run tools/redirects/redirects.go count) \ + -dWITH_RUNTIME_AVXMEMMOVE=$(shell grep -r "var useAVXmemmove" $(GOROOT)/src/runtime/ | wc -l) MIN_OBJCOPY_VERSION := 2.26.0 HAVE_VALID_OBJCOPY := $(shell objcopy -V | head -1 | awk -F ' ' '{print "$(MIN_OBJCOPY_VERSION)\n" $$NF}' | sort -ct. -k1,1n -k2,2n && echo "y") @@ -74,13 +75,14 @@ go.o: @# objcopy to make that symbol exportable. Since nasm does not support externs @# with slashes we create a global symbol alias for kernel.Kmain @echo "[objcopy] create kernel.Kmain alias to gopheros/kernel/kmain.Kmain" - @echo "[objcopy] globalizing symbols {_rt0_interrupt_handlers, runtime.g0/m0/physPageSize}" + @echo "[objcopy] globalizing symbols {_rt0_interrupt_handlers, runtime.g0/m0/physPageSize/useAVXmemmove}" @objcopy \ --add-symbol kernel.Kmain=.text:0x`nm $(BUILD_DIR)/go.o | grep "kmain.Kmain$$" | cut -d' ' -f1` \ --globalize-symbol _rt0_interrupt_handlers \ --globalize-symbol runtime.g0 \ --globalize-symbol runtime.m0 \ --globalize-symbol runtime.physPageSize \ + --globalize-symbol runtime.useAVXmemmove \ $(BUILD_DIR)/go.o $(BUILD_DIR)/go.o binutils_version_check: diff --git a/src/arch/x86_64/asm/rt0_64.s b/src/arch/x86_64/asm/rt0_64.s index fb42fc9..784331e 100644 --- a/src/arch/x86_64/asm/rt0_64.s +++ b/src/arch/x86_64/asm/rt0_64.s @@ -27,6 +27,14 @@ _rt0_irq_handlers resq IDT_ENTRIES r0_g_ptr: resq 1 tcb_ptr: resq 1 +; Go < 1.9 does not define runtime.useAVXmemmove; to avoid linker errors define +; a dummy symbol so that the gate entry code can work as expected. +%if WITH_RUNTIME_AVXMEMMOVE == 0 + runtime.useAVXmemmove resb 1 +%else + extern runtime.useAVXmemmove +%endif + section .text ;------------------------------------------------------------------------------ @@ -182,7 +190,7 @@ _rt0_64_gate_entry_%+ gate_num: %assign gate_num gate_num+1 %endrep -%macro save_regs 0 +%macro save_gp_regs 0 push r15 push r14 push r13 @@ -200,7 +208,7 @@ _rt0_64_gate_entry_%+ gate_num: push rax %endmacro -%macro restore_regs 0 +%macro restore_gp_regs 0 pop rax pop rbx pop rcx @@ -218,97 +226,163 @@ _rt0_64_gate_entry_%+ gate_num: pop r15 %endmacro +%macro save_xmm_regs 0 + sub rsp, 16*16 + movdqu [rsp+0*16], xmm0 + movdqu [rsp+1*16], xmm1 + movdqu [rsp+2*16], xmm2 + movdqu [rsp+3*16], xmm3 + movdqu [rsp+4*16], xmm4 + movdqu [rsp+5*16], xmm5 + movdqu [rsp+6*16], xmm6 + movdqu [rsp+7*16], xmm7 + movdqu [rsp+8*16], xmm8 + movdqu [rsp+9*16], xmm9 + movdqu [rsp+10*16], xmm10 + movdqu [rsp+11*16], xmm11 + movdqu [rsp+12*16], xmm12 + movdqu [rsp+13*16], xmm13 + movdqu [rsp+14*16], xmm14 + movdqu [rsp+15*16], xmm15 +%endmacro + +%macro restore_xmm_regs 0 + movdqu xmm0, [rsp+0*16] + movdqu xmm1, [rsp+1*16] + movdqu xmm2, [rsp+2*16] + movdqu xmm3, [rsp+3*16] + movdqu xmm4, [rsp+4*16] + movdqu xmm5, [rsp+5*16] + movdqu xmm6, [rsp+6*16] + movdqu xmm7, [rsp+7*16] + movdqu xmm8, [rsp+8*16] + movdqu xmm9, [rsp+9*16] + movdqu xmm10, [rsp+10*16] + movdqu xmm11, [rsp+11*16] + movdqu xmm12, [rsp+12*16] + movdqu xmm13, [rsp+13*16] + movdqu xmm14, [rsp+14*16] + movdqu xmm15, [rsp+15*16] + add rsp, 16*16 +%endmacro + + ;------------------------------------------------------------------------------ ; This dispatcher is invoked by gate entries that expect a code to be pushed -; by the CPU to the stack. It performs the following functions: -; - save registers -; - push pointer to saved regs -; - push pointer to stack frame -; - read and push exception code -; - invoke handler(code, &frame, ®s) -; - restore registers -; - pop exception code from stack so rsp points to the stack frame +; by the CPU to the stack. +; +; This is the stack layout used by this function. Items are 8-bytes +; wide with the exception of the xmm regs that are 16 bytes wide +; +; ----------------| +; useAVXmemmove | <- original value of runtime.useAVXmemmove +;-----------------| +; xmm regs (16) | +;-----------------| <- RBP will point at the last pushed GP reg +; gp regs (15) | +;-----------------| +; handler address | <- pushed by gate_entry_xxx (RSP initially points here) +;-----------------| +; exception code | <- pushed by CPU (must be popped before returning) +;-----------------| +; RIP | <- pushed by CPU (exception frame) +; CS | +; RFLAGS | +; RSP | +; SS | +;----------------- ;------------------------------------------------------------------------------ _rt0_64_gate_dispatcher_with_code: - ; This is how the stack looks like when entering this function: - ; (each item is 8-bytes wide) - ; - ;------------------ - ; handler address | <- pushed by gate_entry_xxx (RSP points here) - ;-----------------| - ; Exception code | <- needs to be removed from stack before calling iretq - ;-----------------| - ; RIP | <- exception frame - ; CS | - ; RFLAGS | - ; RSP | - ; SS | - ;----------------- cld - ; save regs and push a pointer to them - save_regs - mov rax, rsp ; rax points to saved rax - push rax ; push pointer to saved regs + ; save general-purpose regs + save_gp_regs + mov rbp, rsp ; rbp points to saved rax - ; push pointer to exception stack frame (we have used 15 qwords for the - ; saved registers plus one qword for the data pushed by the gate entry - ; plus one extra qword to jump over the exception code) - add rax, 17*8 - push rax + ; save xmm regs as the fault handler may clobber them by calling an + ; SSE-enabled runtime function like copy (calls runtime.memmove). In + ; addition temporarily disable AVX support for runtime.memmove so we + ; don't need to also preserve the avx regs. + save_xmm_regs + mov rax, runtime.useAVXmemmove + push qword [rax] + mov byte [rax], 0 - ; push exception code (located between the stack frame and the saved regs) - sub rax, 8 - push qword [rax] + ; push exception handler args and call registered handler + push qword [rbp] ; ptr to regs + push qword [rbp+17*8] ; ptr to exception frame + push qword [rbp+16*8] ; exception code + call qword [rbp+15*8] + add rsp, 3 * 8 - call [rsp + 18*8] ; call registered irq handler + ; restore xmm regs and restore AVX support for runtime.memmove + mov rax, runtime.useAVXmemmove + pop rbx + mov byte [rax], bl + restore_xmm_regs - add rsp, 3 * 8 ; unshift the pushed arguments so rsp points to the saved regs - restore_regs + ; restore general purpose regs + restore_gp_regs - add rsp, 16 ; pop handler address and exception code off the stack before returning + ; pop handler address + exception code so RSP points to the stack frame. + add rsp, 2*8 iretq ;------------------------------------------------------------------------------ ; This dispatcher is invoked by gate entries that do not use exception codes. -; It performs the following functions: -; - save registers -; - push pointer to saved regs -; - push pointer to stack frame -; - invoke handler(&frame, ®s) -; - restore registers +; +; This is the stack layout used by this function. Items are 8-bytes +; wide with the exception of the xmm regs that are 16 bytes wide +; +; ----------------| +; useAVXmemmove | <- original value of runtime.useAVXmemmove +;-----------------| +; xmm regs (16) | +;-----------------| <- RBP will point at the last pushed GP reg +; gp regs (15) | +;-----------------| +; handler address | <- pushed by gate_entry_xxx (RSP initially points here) +;-----------------| +; RIP | <- pushed by CPU (exception frame) +; CS | +; RFLAGS | +; RSP | +; SS | +;----------------- ;------------------------------------------------------------------------------ _rt0_64_gate_dispatcher_without_code: - ; This is how the stack looks like when entering this function: - ; (each item is 8-bytes wide) - ; - ;------------------ - ; handler address | <- pushed by gate_entry_xxx (RSP points here) - ;-----------------| - ; RIP | <- exception frame - ; CS | - ; RFLAGS | - ; RSP | - ; SS | - ;----------------- cld - ; save regs and push a pointer to them - save_regs - mov rax, rsp ; rax points to saved rax - push rax ; push pointer to saved regs + ; save general-purpose regs + save_gp_regs + mov rbp, rsp ; rbp points to saved rax - ; push pointer to exception stack frame (we have used 15 qwords for the - ; saved registers plus one qword for the data pushed by the gate entry) - add rax, 16*8 - push rax + ; save xmm regs as the fault handler may clobber them by calling an + ; SSE-enabled runtime function like copy (calls runtime.memmove). In + ; addition temporarily disable AVX support for runtime.memmove so we + ; don't need to also preserve the avx regs. + save_xmm_regs + mov rax, runtime.useAVXmemmove + push qword [rax] + mov byte [rax], 0 - call [rsp + 17*8] ; call registered irq handler + ; push exception handler args and call registered handler + push qword [rbp] ; ptr to regs + push qword [rbp+16*8] ; ptr to exception frame + call qword [rbp+15*8] + add rsp, 2 * 8 - add rsp, 2 * 8 ; unshift the pushed arguments so rsp points to the saved regs - restore_regs + ; restore xmm regs and restore AVX support for runtime.memmove + mov rax, runtime.useAVXmemmove + pop rbx + mov byte [rax], bl + restore_xmm_regs - add rsp, 8 ; pop handler address off the stack before returning + ; restore general purpose regs + restore_gp_regs + + ; pop handler address so RSP points to the stack frame. + add rsp, 8 iretq ;------------------------------------------------------------------------------ diff --git a/src/gopheros/kernel/kfmt/ringbuf.go b/src/gopheros/kernel/kfmt/ringbuf.go index c377dbc..2bd5baa 100644 --- a/src/gopheros/kernel/kfmt/ringbuf.go +++ b/src/gopheros/kernel/kfmt/ringbuf.go @@ -40,9 +40,10 @@ func (rb *ringBuffer) Read(p []byte) (n int, err error) { n = pLen } - for i := 0; i < n; i, rb.rIndex = i+1, rb.rIndex+1 { - p[i] = rb.buffer[rb.rIndex] - } + copy(p, rb.buffer[rb.rIndex:rb.rIndex+n]) + rb.rIndex += n + + return n, nil case rb.rIndex > rb.wIndex: // Read up to min(len(buf) - rIndex, len(p)) bytes n = len(rb.buffer) - rb.rIndex @@ -50,17 +51,15 @@ func (rb *ringBuffer) Read(p []byte) (n int, err error) { n = pLen } - for i := 0; i < n; i, rb.rIndex = i+1, rb.rIndex+1 { - p[i] = rb.buffer[rb.rIndex] - } + copy(p, rb.buffer[rb.rIndex:rb.rIndex+n]) + rb.rIndex += n if rb.rIndex == len(rb.buffer) { rb.rIndex = 0 } + return n, nil default: // rIndex == wIndex - n, err = 0, io.EOF + return 0, io.EOF } - - return n, err }