From ab0db651af6f1ffa8fe96909ce16ae314d65c3fb Mon Sep 17 00:00:00 2001 From: Frans Kaashoek Date: Sun, 23 Sep 2018 08:24:42 -0400 Subject: [PATCH] Checkpoint port of xv6 to x86-64. Passed usertests on 2 processors a few times. The x86-64 doesn't just add two levels to page tables to support 64 bit addresses, but is a different processor. For example, calling conventions, system calls, and segmentation are different from 32-bit x86. Segmentation is basically gone, but gs/fs in combination with MSRs can be used to hold a per-core pointer. In general, x86-64 is more straightforward than 32-bit x86. The port uses code from sv6 and the xv6 "rsc-amd64" branch. A summary of the changes is as follows: - Booting: switch to grub instead of xv6's bootloader (pass -kernel to qemu), because xv6's boot loader doesn't understand 64bit ELF files. And, we don't care anymore about booting. - Makefile: use -m64 instead of -m32 flag for gcc, delete boot loader, xv6.img, bochs, and memfs. For now don't use -O2, since usertests with -O2 is bigger than MAXFILE! - Update gdb.tmpl to be for i386 or x86-64 - Console/printf: use stdarg.h and treat 64-bit addresses different from ints (32-bit) - Update elfhdr to be 64 bit - entry.S/entryother.S: add code to switch to 64-bit mode: build a simple page table in 32-bit mode before switching to 64-bit mode, share code for entering boot processor and APs, and tweak boot gdt. The boot gdt is the gdt that the kernel proper also uses. (In 64-bit mode, the gdt/segmentation and task state mostly disappear.) - exec.c: fix passing argv (64-bit now instead of 32-bit). - initcode.c: use syscall instead of int. - kernel.ld: load kernel very high, in top terabyte. 64 bits is a lot of address space! - proc.c: initial return is through new syscall path instead of trapret. - proc.h: update struct cpu to have some scratch space since syscall saves less state than int, update struct context to reflect x86-64 calling conventions. 
- swtch: simplify for x86-64 calling conventions. - syscall: add fetcharg to handle x86-64 calling conventions (6 arguments are passed through registers), and fetchaddr to read a 64-bit value from user space. - sysfile: update to handle pointers from user space (e.g., sys_exec), which are 64 bits. - trap.c: no special trap vector for sys calls, because x86-64 has a different plan for system calls. - trapasm: one plan for syscalls and one plan for traps (interrupts and exceptions). On x86-64, the kernel is responsible for switching user/kernel stacks. To do this, xv6 keeps some scratch space in the cpu structure, and uses MSR GS_KERN_BASE to point to the core's cpu structure (using swapgs). - types.h: add uint64, and change pde_t to uint64 - usertests: exit() when fork fails, which helped in tracking down one of the bugs in the switch from 32-bit to 64-bit - vectors: update to make them 64 bits - vm.c: use bootgdt in kernel too, program MSRs for syscalls and core-local state (for swapgs), walk 4 levels in walkpgdir, add DEVSPACETOP, use task segment to set kernel stack for interrupts (but simpler than in 32-bit mode), add an extra argument to freevm (size of user part of address space) to avoid checking all entries till KERNBASE (there are MANY TB before the top 1TB). - x86: update trapframe to have 64-bit entries, which is what the processor pushes on syscalls and traps. simplify lgdt and lidt, using struct desctr, which needs the gcc directives packed and aligned. TODO: - use int32 instead of int? - simplify curproc(). xv6 has per-cpu state again, but this time it must have it. 
- avoid repetition in walkpgdir - fix validateint() in usertests.c - fix bugs (e.g., observed one a case of entering kernel with invalid gs or proc --- .gdbinit.tmpl | 27 ----- .gdbinit.tmpl-i386 | 5 + .gdbinit.tmpl-x64 | 18 ++++ Makefile | 79 ++++---------- bootasm.S | 88 --------------- console.c | 30 ++++-- defs.h | 12 ++- elf.h | 22 ++-- entry.S | 261 ++++++++++++++++++++++++++++++++++++--------- entryother.S | 57 +++------- exec.c | 30 +++--- initcode.S | 13 ++- ioapic.c | 3 +- kalloc.c | 6 +- kernel.ld | 41 ++----- main.c | 69 +++++------- memlayout.h | 7 +- mmu.h | 232 +++++++++++++++++++--------------------- mp.c | 6 +- mp.h | 8 +- msr.h | 25 +++++ printf.c | 34 +++--- proc.c | 34 +++--- proc.h | 21 ++-- spinlock.c | 10 +- spinlock.h | 2 +- string.c | 2 +- swtch.S | 36 ++++--- syscall.c | 58 ++++++++-- sysfile.c | 6 +- trap.c | 29 +++-- trapasm.S | 150 ++++++++++++++++++++++---- traps.h | 1 + types.h | 8 +- usertests.c | 38 +++++-- usys.S | 2 +- vectors.pl | 16 +-- vm.c | 211 +++++++++++++++++++++++++----------- x86.h | 104 ++++++++---------- 39 files changed, 1039 insertions(+), 762 deletions(-) delete mode 100644 .gdbinit.tmpl create mode 100644 .gdbinit.tmpl-i386 create mode 100644 .gdbinit.tmpl-x64 delete mode 100644 bootasm.S create mode 100644 msr.h diff --git a/.gdbinit.tmpl b/.gdbinit.tmpl deleted file mode 100644 index f71681a..0000000 --- a/.gdbinit.tmpl +++ /dev/null @@ -1,27 +0,0 @@ -set $lastcs = -1 - -define hook-stop - # There doesn't seem to be a good way to detect if we're in 16- or - # 32-bit mode, but in 32-bit mode we always run with CS == 8 in the - # kernel and CS == 35 in user space - if $cs == 8 || $cs == 35 - if $lastcs != 8 && $lastcs != 35 - set architecture i386 - end - x/i $pc - else - if $lastcs == -1 || $lastcs == 8 || $lastcs == 35 - set architecture i8086 - end - # Translate the segment:offset into a physical address - printf "[%4x:%4x] ", $cs, $eip - x/i $cs*16+$eip - end - set $lastcs = $cs -end - -echo + target remote 
localhost:1234\n -target remote localhost:1234 - -echo + symbol-file kernel\n -symbol-file kernel diff --git a/.gdbinit.tmpl-i386 b/.gdbinit.tmpl-i386 new file mode 100644 index 0000000..f4f85d2 --- /dev/null +++ b/.gdbinit.tmpl-i386 @@ -0,0 +1,5 @@ +python +gdb.execute("target remote localhost:26000") +gdb.execute("set architecture i386") +gdb.execute("symbol-file kernel") +gdb.execute("break *0x7c00") diff --git a/.gdbinit.tmpl-x64 b/.gdbinit.tmpl-x64 new file mode 100644 index 0000000..9c120ff --- /dev/null +++ b/.gdbinit.tmpl-x64 @@ -0,0 +1,18 @@ +#if you would like to use gdb in 32bit mode, comment out lines 8 and 15, then uncomment +#the lines after. Note this will only work properly until 64bit mode is enabled in entry.S + +python +gdb.execute("set architecture i386:x86-64:intel") +gdb.execute("target remote localhost:26000") +gdb.execute("symbol-file kernel") +gdb.execute("break start64") +#gdb.execute("break *0x7c00") +try: + gdb.execute("continue") +except: + pass +gdb.execute("disconnect") +gdb.execute("set architecture i386:x86-64") +#gdb.execute("set architecture i386") +gdb.execute("target remote localhost:26000") +gdb.execute("delete break 1") diff --git a/Makefile b/Makefile index 09d790c..b199842 100644 --- a/Makefile +++ b/Makefile @@ -51,7 +51,7 @@ TOOLPREFIX := $(shell if i386-jos-elf-objdump -i 2>&1 | grep '^elf32-i386$$' >/d endif # If the makefile can't find QEMU, specify its path here -# QEMU = qemu-system-i386 +QEMU = qemu-system-x86_64 # Try to infer the correct QEMU ifndef QEMU @@ -76,11 +76,16 @@ AS = $(TOOLPREFIX)gas LD = $(TOOLPREFIX)ld OBJCOPY = $(TOOLPREFIX)objcopy OBJDUMP = $(TOOLPREFIX)objdump -CFLAGS = -fno-pic -static -fno-builtin -fno-strict-aliasing -O2 -Wall -MD -ggdb -m32 -Werror -fno-omit-frame-pointer + +XFLAGS = -m64 -mcmodel=large -ggdb +# CFLAGS = -fno-pic -static -fno-builtin -fno-strict-aliasing -O2 -Wall -MD -ggdb -Werror -fno-omit-frame-pointer +CFLAGS = -fno-pic -static -fno-builtin -fno-strict-aliasing -Wall -MD 
-ggdb -Werror -fno-omit-frame-pointer +CFLAGS += -ffreestanding -fno-common -nostdlib $(XFLAGS) CFLAGS += $(shell $(CC) -fno-stack-protector -E -x c /dev/null >/dev/null 2>&1 && echo -fno-stack-protector) -ASFLAGS = -m32 -gdwarf-2 -Wa,-divide +ASFLAGS = -gdwarf-2 -Wa,-divide $(XFLAGS) # FreeBSD ld wants ``elf_i386_fbsd'' -LDFLAGS += -m $(shell $(LD) -V | grep elf_i386 2>/dev/null | head -n 1) +LDFLAGS += -m $(shell $(LD) -V | grep elf_x86_64 2>/dev/null | head -n 1) +LDFLAGS += -z max-page-size=4096 # Disable PIE when possible (for Ubuntu 16.10 toolchain) ifneq ($(shell $(CC) -dumpspecs 2>/dev/null | grep -e '[^f]no-pie'),) @@ -90,23 +95,10 @@ ifneq ($(shell $(CC) -dumpspecs 2>/dev/null | grep -e '[^f]nopie'),) CFLAGS += -fno-pie -nopie endif -xv6.img: bootblock kernel - dd if=/dev/zero of=xv6.img count=10000 - dd if=bootblock of=xv6.img conv=notrunc - dd if=kernel of=xv6.img seek=1 conv=notrunc - -xv6memfs.img: bootblock kernelmemfs - dd if=/dev/zero of=xv6memfs.img count=10000 - dd if=bootblock of=xv6memfs.img conv=notrunc - dd if=kernelmemfs of=xv6memfs.img seek=1 conv=notrunc - -bootblock: bootasm.S bootmain.c - $(CC) $(CFLAGS) -fno-pic -O -nostdinc -I. -c bootmain.c - $(CC) $(CFLAGS) -fno-pic -nostdinc -I. -c bootasm.S - $(LD) $(LDFLAGS) -N -e start -Ttext 0x7C00 -o bootblock.o bootasm.o bootmain.o - $(OBJDUMP) -S bootblock.o > bootblock.asm - $(OBJCOPY) -S -O binary -j .text bootblock.o bootblock - ./sign.pl bootblock +kernel: $(OBJS) entry.o entryother initcode kernel.ld + $(LD) $(LDFLAGS) -T kernel.ld -o kernel entry.o $(OBJS) -b binary initcode entryother + $(OBJDUMP) -S kernel > kernel.asm + $(OBJDUMP) -t kernel | sed '1,/SYMBOL TABLE/d; s/ .* / /; /^$$/d' > kernel.sym entryother: entryother.S $(CC) $(CFLAGS) -fno-pic -nostdinc -I. 
-c entryother.S @@ -120,23 +112,6 @@ initcode: initcode.S $(OBJCOPY) -S -O binary initcode.out initcode $(OBJDUMP) -S initcode.o > initcode.asm -kernel: $(OBJS) entry.o entryother initcode kernel.ld - $(LD) $(LDFLAGS) -T kernel.ld -o kernel entry.o $(OBJS) -b binary initcode entryother - $(OBJDUMP) -S kernel > kernel.asm - $(OBJDUMP) -t kernel | sed '1,/SYMBOL TABLE/d; s/ .* / /; /^$$/d' > kernel.sym - -# kernelmemfs is a copy of kernel that maintains the -# disk image in memory instead of writing to a disk. -# This is not so useful for testing persistent storage or -# exploring disk buffering implementations, but it is -# great for testing the kernel on real hardware without -# needing a scratch disk. -MEMFSOBJS = $(filter-out ide.o,$(OBJS)) memide.o -kernelmemfs: $(MEMFSOBJS) entry.o entryother initcode kernel.ld fs.img - $(LD) $(LDFLAGS) -T kernel.ld -o kernelmemfs entry.o $(MEMFSOBJS) -b binary initcode entryother fs.img - $(OBJDUMP) -S kernelmemfs > kernelmemfs.asm - $(OBJDUMP) -t kernelmemfs | sed '1,/SYMBOL TABLE/d; s/ .* / /; /^$$/d' > kernelmemfs.sym - tags: $(OBJS) entryother.S _init etags *.S *.c @@ -190,8 +165,8 @@ fs.img: mkfs README $(UPROGS) clean: rm -f *.tex *.dvi *.idx *.aux *.log *.ind *.ilg \ *.o *.d *.asm *.sym vectors.S bootblock entryother \ - initcode initcode.out kernel xv6.img fs.img kernelmemfs \ - xv6memfs.img mkfs .gdbinit \ + initcode initcode.out kernel fs.img kernelmemfs \ + mkfs .gdbinit \ $(UPROGS) # make a printout @@ -204,12 +179,6 @@ xv6.pdf: $(PRINT) print: xv6.pdf -# run in emulators - -bochs : fs.img xv6.img - if [ ! 
-e .bochsrc ]; then ln -s dot-bochsrc .bochsrc; fi - bochs -q - # try to generate a unique GDB port GDBPORT = $(shell expr `id -u` % 5000 + 25000) # QEMU's gdb stub command line changed in 0.11 @@ -219,25 +188,21 @@ QEMUGDB = $(shell if $(QEMU) -help | grep -q '^-gdb'; \ ifndef CPUS CPUS := 2 endif -QEMUOPTS = -drive file=fs.img,index=1,media=disk,format=raw -drive file=xv6.img,index=0,media=disk,format=raw -smp $(CPUS) -m 512 $(QEMUEXTRA) - -qemu: fs.img xv6.img +QEMUOPTS = -kernel kernel -drive file=fs.img,index=1,media=disk,format=raw -smp $(CPUS) -m 512 $(QEMUEXTRA) +qemu: fs.img $(QEMU) -serial mon:stdio $(QEMUOPTS) -qemu-memfs: xv6memfs.img - $(QEMU) -drive file=xv6memfs.img,index=0,media=disk,format=raw -smp $(CPUS) -m 256 - -qemu-nox: fs.img xv6.img +qemu-nox: fs.img kernel $(QEMU) -nographic $(QEMUOPTS) -.gdbinit: .gdbinit.tmpl +.gdbinit: .gdbinit.tmpl-x64 sed "s/localhost:1234/localhost:$(GDBPORT)/" < $^ > $@ -qemu-gdb: fs.img xv6.img .gdbinit +qemu-gdb: fs.img kernel .gdbinit @echo "*** Now run 'gdb'." 1>&2 - $(QEMU) -serial mon:stdio $(QEMUOPTS) -S $(QEMUGDB) + $(QEMU) $(QEMUOPTS) -S $(QEMUGDB) -qemu-nox-gdb: fs.img xv6.img .gdbinit +qemu-nox-gdb: fs.img kernel .gdbinit @echo "*** Now run 'gdb'." 1>&2 $(QEMU) -nographic $(QEMUOPTS) -S $(QEMUGDB) diff --git a/bootasm.S b/bootasm.S deleted file mode 100644 index 257867c..0000000 --- a/bootasm.S +++ /dev/null @@ -1,88 +0,0 @@ -#include "asm.h" -#include "memlayout.h" -#include "mmu.h" - -# Start the first CPU: switch to 32-bit protected mode, jump into C. -# The BIOS loads this code from the first sector of the hard disk into -# memory at physical address 0x7c00 and starts executing in real mode -# with %cs=0 %ip=7c00. - -.code16 # Assemble for 16-bit mode -.globl start -start: - cli # BIOS enabled interrupts; disable - - # Zero data segment registers DS, ES, and SS. 
- xorw %ax,%ax # Set %ax to zero - movw %ax,%ds # -> Data Segment - movw %ax,%es # -> Extra Segment - movw %ax,%ss # -> Stack Segment - - # Physical address line A20 is tied to zero so that the first PCs - # with 2 MB would run software that assumed 1 MB. Undo that. -seta20.1: - inb $0x64,%al # Wait for not busy - testb $0x2,%al - jnz seta20.1 - - movb $0xd1,%al # 0xd1 -> port 0x64 - outb %al,$0x64 - -seta20.2: - inb $0x64,%al # Wait for not busy - testb $0x2,%al - jnz seta20.2 - - movb $0xdf,%al # 0xdf -> port 0x60 - outb %al,$0x60 - - # Switch from real to protected mode. Use a bootstrap GDT that makes - # virtual addresses map directly to physical addresses so that the - # effective memory map doesn't change during the transition. - lgdt gdtdesc - movl %cr0, %eax - orl $CR0_PE, %eax - movl %eax, %cr0 - -//PAGEBREAK! - # Complete the transition to 32-bit protected mode by using a long jmp - # to reload %cs and %eip. The segment descriptors are set up with no - # translation, so that the mapping is still the identity mapping. - ljmp $(SEG_KCODE<<3), $start32 - -.code32 # Tell assembler to generate 32-bit code now. -start32: - # Set up the protected-mode data segment registers - movw $(SEG_KDATA<<3), %ax # Our data segment selector - movw %ax, %ds # -> DS: Data Segment - movw %ax, %es # -> ES: Extra Segment - movw %ax, %ss # -> SS: Stack Segment - movw $0, %ax # Zero segments not ready for use - movw %ax, %fs # -> FS - movw %ax, %gs # -> GS - - # Set up the stack pointer and call into C. - movl $start, %esp - call bootmain - - # If bootmain returns (it shouldn't), trigger a Bochs - # breakpoint if running under Bochs, then loop. 
- movw $0x8a00, %ax # 0x8a00 -> port 0x8a00 - movw %ax, %dx - outw %ax, %dx - movw $0x8ae0, %ax # 0x8ae0 -> port 0x8a00 - outw %ax, %dx -spin: - jmp spin - -# Bootstrap GDT -.p2align 2 # force 4 byte alignment -gdt: - SEG_NULLASM # null seg - SEG_ASM(STA_X|STA_R, 0x0, 0xffffffff) # code seg - SEG_ASM(STA_W, 0x0, 0xffffffff) # data seg - -gdtdesc: - .word (gdtdesc - gdt - 1) # sizeof(gdt) - 1 - .long gdt # address gdt - diff --git a/console.c b/console.c index a280d2b..9986a9c 100644 --- a/console.c +++ b/console.c @@ -2,6 +2,8 @@ // Input is from the keyboard or serial port. // Output is written to the screen and serial port. +#include + #include "types.h" #include "defs.h" #include "param.h" @@ -24,10 +26,11 @@ static struct { int locking; } cons; +static char digits[] = "0123456789abcdef"; + static void printint(int xx, int base, int sign) { - static char digits[] = "0123456789abcdef"; char buf[16]; int i; uint x; @@ -48,14 +51,25 @@ printint(int xx, int base, int sign) while(--i >= 0) consputc(buf[i]); } + +static void +printptr(uint64 x) { + int i; + consputc('0'); + consputc('x'); + for (i = 0; i < (sizeof(uint64) * 2); i++, x <<= 4) + consputc(digits[x >> (sizeof(uint64) * 8 - 4)]); +} + + //PAGEBREAK: 50 // Print to the console. only understands %d, %x, %p, %s. void cprintf(char *fmt, ...) { + va_list ap; int i, c, locking; - uint *argp; char *s; locking = cons.locking; @@ -65,7 +79,7 @@ cprintf(char *fmt, ...) if (fmt == 0) panic("null fmt"); - argp = (uint*)(void*)(&fmt + 1); + va_start(ap, fmt); for(i = 0; (c = fmt[i] & 0xff) != 0; i++){ if(c != '%'){ consputc(c); @@ -76,14 +90,16 @@ cprintf(char *fmt, ...) 
break; switch(c){ case 'd': - printint(*argp++, 10, 1); + printint(va_arg(ap, int), 10, 1); break; case 'x': + printint(va_arg(ap, int), 16, 1); + break; case 'p': - printint(*argp++, 16, 0); + printptr(va_arg(ap, uint64)); break; case 's': - if((s = (char*)*argp++) == 0) + if((s = va_arg(ap, char*)) == 0) s = "(null)"; for(; *s; s++) consputc(*s); @@ -107,7 +123,7 @@ void panic(char *s) { int i; - uint pcs[10]; + uint64 pcs[10]; cli(); cons.locking = 0; diff --git a/defs.h b/defs.h index 82fb982..fd9ecb4 100644 --- a/defs.h +++ b/defs.h @@ -126,7 +126,7 @@ void swtch(struct context**, struct context*); // spinlock.c void acquire(struct spinlock*); -void getcallerpcs(void*, uint*); +void getcallerpcs(void*, uint64*); int holding(struct spinlock*); void initlock(struct spinlock*, char*); void release(struct spinlock*); @@ -152,8 +152,10 @@ char* strncpy(char*, const char*, int); int argint(int, int*); int argptr(int, char**, int); int argstr(int, char**); -int fetchint(uint, int*); -int fetchstr(uint, char**); +int argaddr(int, uint64 *); +int fetchint(uint64, int*); +int fetchstr(uint64, char**); +int fetchaddr(uint64, uint64*); void syscall(void); // timer.c @@ -176,8 +178,8 @@ void kvmalloc(void); pde_t* setupkvm(void); char* uva2ka(pde_t*, char*); int allocuvm(pde_t*, uint, uint); -int deallocuvm(pde_t*, uint, uint); -void freevm(pde_t*); +int deallocuvm(pde_t*, uint64, uint64); +void freevm(pde_t*, uint64); void inituvm(pde_t*, char*, uint); int loaduvm(pde_t*, char*, struct inode*, uint, uint); pde_t* copyuvm(pde_t*, uint); diff --git a/elf.h b/elf.h index d16c967..84555fa 100644 --- a/elf.h +++ b/elf.h @@ -9,9 +9,9 @@ struct elfhdr { ushort type; ushort machine; uint version; - uint entry; - uint phoff; - uint shoff; + uint64 entry; + uint64 phoff; + uint64 shoff; uint flags; ushort ehsize; ushort phentsize; @@ -23,14 +23,14 @@ struct elfhdr { // Program section header struct proghdr { - uint type; - uint off; - uint vaddr; - uint paddr; - uint filesz; - uint 
memsz; - uint flags; - uint align; + uint32 type; + uint32 flags; + uint64 off; + uint64 vaddr; + uint64 paddr; + uint64 filesz; + uint64 memsz; + uint64 align; }; // Values for Proghdr type diff --git a/entry.S b/entry.S index bc79bab..88ad92b 100644 --- a/entry.S +++ b/entry.S @@ -1,68 +1,223 @@ -# The xv6 kernel starts executing in this file. This file is linked with -# the kernel C code, so it can refer to kernel symbols such as main(). -# The boot block (bootasm.S and bootmain.c) jumps to entry below. - -# Multiboot header, for multiboot boot loaders like GNU Grub. +# x86-64 bootstrap, assuming load by MultiBoot-compliant loader. +# The MutliBoot specification is at: # http://www.gnu.org/software/grub/manual/multiboot/multiboot.html -# -# Using GRUB 2, you can boot xv6 from a file stored in a -# Linux file system by copying kernel or kernelmemfs to /boot -# and then adding this menu entry: -# -# menuentry "xv6" { -# insmod ext2 -# set root='(hd0,msdos1)' -# set kernel='/boot/kernel' -# echo "Loading ${kernel}..." -# multiboot ${kernel} ${kernel} -# boot -# } +# GRUB is a MultiBoot loader, as is qemu's -kernel option. -#include "asm.h" -#include "memlayout.h" #include "mmu.h" -#include "param.h" +#include "memlayout.h" -# Multiboot header. Data to direct multiboot loader. -.p2align 2 +# STACK is the size of the bootstrap stack. +#define STACK 8192 + +# MultiBoot header. +# http://www.gnu.org/software/grub/manual/multiboot/multiboot.html#Header-layout +.align 4 .text .globl multiboot_header multiboot_header: #define magic 0x1badb002 - #define flags 0 + #define flags (1<<16 | 1<<0) .long magic .long flags - .long (-magic-flags) + .long (- magic - flags) # checksum + .long V2P_WO(multiboot_header) # header address + .long V2P_WO(multiboot_header) # load address + .long V2P_WO(edata) # load end address + .long V2P_WO(end) # bss end address + .long V2P_WO(start) # entry address -# By convention, the _start symbol specifies the ELF entry point. 
-# Since we haven't set up virtual memory yet, our entry point is -# the physical address of 'entry'. -.globl _start -_start = V2P_WO(entry) +# Entry point jumped to by boot loader. Running in 32-bit mode. +# http://www.gnu.org/software/grub/manual/multiboot/multiboot.html#Machine-state +# +# EAX = 0x2badb002 +# EBX = address of multiboot information structure +# CS = 32-bit read/execute code segment with identity map +# DS, ES, FS, GS, SS = 32-bit read/write data segment with identity map +# A20 gate = enabled +# CR0 = PE set, PG clear +# EFLAGS = VM clear, IF clear +# +.code32 +.globl start +start: + # Tell BIOS to do "warm reboot" when we shut down. + movw $0x1234, 0x472 -# Entering xv6 on boot processor, with paging off. -.globl entry -entry: - # Turn on page size extension for 4Mbyte pages - movl %cr4, %eax - orl $(CR4_PSE), %eax - movl %eax, %cr4 - # Set page directory - movl $(V2P_WO(entrypgdir)), %eax - movl %eax, %cr3 - # Turn on paging. - movl %cr0, %eax - orl $(CR0_PG|CR0_WP), %eax - movl %eax, %cr0 + # Set up multiboot arguments for main. + movl %eax, %edi + movl %ebx, %esi - # Set up the stack pointer. - movl $(stack + KSTACKSIZE), %esp + # Initialize stack. + movl $V2P_WO(stack+STACK), %esp + + # Zero bss. QEMU's MultiBoot seems not to. + # It's possible that the header above is not right, but it looks right. + # %edi is holding multiboot argument, so save in another register. + # (The stack is in the bss.) + movl %edi, %edx + movl $V2P_WO(edata), %edi + movl $V2P_WO(end), %ecx + subl $V2P_WO(edata), %ecx + movl $0, %eax + cld + rep stosb + movl %edx, %edi - # Jump to main(), and switch to executing at - # high addresses. The indirect call is needed because - # the assembler produces a PC-relative instruction - # for a direct jump. - mov $main, %eax - jmp *%eax + call loadgdt + + # Enter new 32-bit code segment (already in 32-bit mode). + ljmp $KCSEG32, $V2P_WO(start32) // code32 segment selector + +start32: + # Initialize page table. 
+ call initpagetables + call init32e + + movl $V2P_WO(start64), %eax + # Enter 64-bit mode. + ljmp $KCSEG, $V2P_WO(tramp64) // code64 segment selector -.comm stack, KSTACKSIZE +.code64 +start64: + # Load VA of stack + movabsq $(stack+STACK), %rsp + # Clear frame pointer for stack walks + movl $0, %ebp + # Call into C code. + call bpmain + # should not return from bpmain + jmp . + +.code32 +.global apstart +apstart: + call loadgdt + ljmp $KCSEG32, $V2P_WO(apstart32) // code32 segment selector + +apstart32: + call init32e + movl $V2P_WO(apstart64), %eax + ljmp $KCSEG, $V2P_WO(tramp64) // code64 segment selector + +.code64 +apstart64: + # Remember (from bootothers), that our kernel stack pointer is + # at the top of our temporary stack. + popq %rax + movq %rax, %rsp + movq $0, %rbp + call apmain +1: jmp 1b + +.code64 +tramp64: + # The linker thinks we are running at tramp64, but we're actually + # running at PADDR(tramp64), so use an explicit calculation to + # load and jump to the correct address. %rax should hold the + # physical address of the jmp target. + movq $KERNBASE, %r11 + addq %r11, %rax + jmp *%rax + +# Initial stack +.comm stack, STACK + +# Page tables. See section 4.5 of 253668.pdf. +# We map the first GB of physical memory at 0 and at 1 TB (not GB) before +# the end of virtual memory. At boot time we are using the mapping at 0 +# but during ordinary execution we use the high mapping. +# The intent is that after bootstrap the kernel can expand this mapping +# to cover all the available physical memory. +# This would be easier if we could use the PS bit to create GB-sized entries +# and skip the pdt table, but not all chips support it, and QEMU doesn't. +.align 4096 +pml4: + .quad V2P_WO(pdpt) + PTE_P + PTE_W // present, read/write + .quad 0 + .space 4096 - 2*16 + .quad V2P_WO(pdpt) + PTE_P + PTE_W + .quad 0 + +.align 4096 +pdpt: + .quad V2P_WO(pdt) + PTE_P + PTE_W + .space 4096 - 8 + +.align 4096 +pdt: + // Filled in below. 
+ .space 4096 + +.code32 +initpagetables: + pushl %edi + pushl %ecx + pushl %eax + + // Set up 64-bit entry in %edx:%eax. + // Base address 0, present, read/write, large page. + movl $(0 | PTE_P | PTE_W | PTE_PS), %eax + movl $0, %edx + + // Fill in 512 entries at pdt. + movl $V2P_WO(pdt), %edi + movl $512, %ecx +1: + // Write this 64-bit entry. + movl %eax, 0(%edi) + movl %edx, 4(%edi) + addl $8, %edi + // 64-bit add to prepare address for next entry. + // Because this is a large page entry, it covers 512 4k pages (2 MB). + add $(512*4096), %eax + adc $0, %edx + loop 1b + + popl %eax + popl %ecx + popl %edi + ret + +# Initialize IA-32e mode. See section 9.8.5 of 253668.pdf. +init32e: + # Set CR4.PAE and CR4.PSE = 1. + movl %cr4, %eax + orl $0x30, %eax + movl %eax, %cr4 + + # Load CR3 with physical base address of level 4 page table. + movl $V2P_WO(pml4), %eax + movl %eax, %cr3 + + # Enable IA-32e mode by setting IA32_EFER.LME = 1. + # Also turn on IA32_EFER.SCE (syscall enable). + movl $0xc0000080, %ecx + rdmsr + orl $0x101, %eax + wrmsr + + # Enable paging by setting CR0.PG = 1. + movl %cr0, %eax + orl $0x80000000, %eax + movl %eax, %cr0 + nop + nop + + ret + +loadgdt: + subl $8, %esp + movl $V2P_WO(bootgdt), 4(%esp) + movw $(8*NSEGS-1), 2(%esp) + lgdt 2(%esp) + addl $8, %esp + + movl $KDSEG, %eax // data segment selector + movw %ax, %ds + movw %ax, %es + movw %ax, %ss + movl $0, %eax // null segment selector + movw %ax, %fs + movw %ax, %gs + + ret diff --git a/entryother.S b/entryother.S index a3b6dc2..3e502f3 100644 --- a/entryother.S +++ b/entryother.S @@ -13,11 +13,9 @@ # # Startothers (in main.c) sends the STARTUPs one at a time. # It copies this code (start) at 0x7000. 
It puts the address of -# a newly allocated per-core stack in start-4,the address of the -# place to jump to (mpenter) in start-8, and the physical address +# a newly allocated per-core stack in start-12,the address of the +# place to jump to (apstart32) in start-4, and the physical address # of entrypgdir in start-12. -# -# This code combines elements of bootasm.S and entry.S. .code16 .globl start @@ -41,53 +39,22 @@ start: # Complete the transition to 32-bit protected mode by using a long jmp # to reload %cs and %eip. The segment descriptors are set up with no # translation, so that the mapping is still the identity mapping. - ljmpl $(SEG_KCODE<<3), $(start32) + ljmpl $(KCSEG32), $start32 -//PAGEBREAK! -.code32 # Tell assembler to generate 32-bit code now. +.code32 start32: - # Set up the protected-mode data segment registers - movw $(SEG_KDATA<<3), %ax # Our data segment selector - movw %ax, %ds # -> DS: Data Segment - movw %ax, %es # -> ES: Extra Segment - movw %ax, %ss # -> SS: Stack Segment - movw $0, %ax # Zero segments not ready for use - movw %ax, %fs # -> FS - movw %ax, %gs # -> GS + movl $start-12, %esp + movl start-4, %ecx + jmp *%ecx - # Turn on page size extension for 4Mbyte pages - movl %cr4, %eax - orl $(CR4_PSE), %eax - movl %eax, %cr4 - # Use entrypgdir as our initial page table - movl (start-12), %eax - movl %eax, %cr3 - # Turn on paging. 
- movl %cr0, %eax - orl $(CR0_PE|CR0_PG|CR0_WP), %eax - movl %eax, %cr0 - - # Switch to the stack allocated by startothers() - movl (start-4), %esp - # Call mpenter() - call *(start-8) - - movw $0x8a00, %ax - movw %ax, %dx - outw %ax, %dx - movw $0x8ae0, %ax - outw %ax, %dx -spin: - jmp spin - -.p2align 2 +.align 4 gdt: SEG_NULLASM - SEG_ASM(STA_X|STA_R, 0, 0xffffffff) - SEG_ASM(STA_W, 0, 0xffffffff) - + SEG_ASM(0xa, 0, 0xffffffff) + SEG_ASM(0x2, 0, 0xffffffff) +.align 16 gdtdesc: - .word (gdtdesc - gdt - 1) + .word 0x17 # sizeof(gdt)-1 .long gdt diff --git a/exec.c b/exec.c index b40134f..b1a9229 100644 --- a/exec.c +++ b/exec.c @@ -4,6 +4,8 @@ #include "mmu.h" #include "proc.h" #include "defs.h" +#include "traps.h" +#include "msr.h" #include "x86.h" #include "elf.h" @@ -12,18 +14,18 @@ exec(char *path, char **argv) { char *s, *last; int i, off; - uint argc, sz, sp, ustack[3+MAXARG+1]; + uint64 argc, sz, sp, ustack[3+MAXARG+1]; struct elfhdr elf; struct inode *ip; struct proghdr ph; pde_t *pgdir, *oldpgdir; struct proc *curproc = myproc(); - + uint64 oldsz = curproc->sz; + begin_op(); if((ip = namei(path)) == 0){ end_op(); - cprintf("exec: fail\n"); return -1; } ilock(ip); @@ -72,7 +74,7 @@ exec(char *path, char **argv) for(argc = 0; argv[argc]; argc++) { if(argc >= MAXARG) goto bad; - sp = (sp - (strlen(argv[argc]) + 1)) & ~3; + sp = (sp - (strlen(argv[argc]) + 1)) & ~(sizeof(uint64)-1); if(copyout(pgdir, sp, argv[argc], strlen(argv[argc]) + 1) < 0) goto bad; ustack[3+argc] = sp; @@ -81,10 +83,13 @@ exec(char *path, char **argv) ustack[0] = 0xffffffff; // fake return PC ustack[1] = argc; - ustack[2] = sp - (argc+1)*4; // argv pointer + ustack[2] = sp - (argc+1)*sizeof(uint64); // argv pointer - sp -= (3+argc+1) * 4; - if(copyout(pgdir, sp, ustack, (3+argc+1)*4) < 0) + curproc->tf->rdi = argc; + curproc->tf->rsi = sp - (argc+1)*sizeof(uint64); + + sp -= (3+argc+1) * sizeof(uint64); + if(copyout(pgdir, sp, ustack, (3+argc+1)*sizeof(uint64)) < 0) goto bad; // Save 
program name for debugging. @@ -92,20 +97,21 @@ exec(char *path, char **argv) if(*s == '/') last = s+1; safestrcpy(curproc->name, last, sizeof(curproc->name)); - + // Commit to the user image. oldpgdir = curproc->pgdir; curproc->pgdir = pgdir; curproc->sz = sz; - curproc->tf->eip = elf.entry; // main - curproc->tf->esp = sp; + curproc->tf->rip = elf.entry; // main + curproc->tf->rcx = elf.entry; + curproc->tf->rsp = sp; switchuvm(curproc); - freevm(oldpgdir); + freevm(oldpgdir, oldsz); return 0; bad: if(pgdir) - freevm(pgdir); + freevm(pgdir, sz); if(ip){ iunlockput(ip); end_op(); diff --git a/initcode.S b/initcode.S index 80ac5d8..e097394 100644 --- a/initcode.S +++ b/initcode.S @@ -8,16 +8,15 @@ # exec(init, argv) .globl start start: - pushl $argv - pushl $init - pushl $0 // where caller pc would be - movl $SYS_exec, %eax - int $T_SYSCALL + mov $init, %rdi + mov $argv, %rsi + mov $SYS_exec, %rax + syscall # for(;;) exit(); exit: - movl $SYS_exit, %eax - int $T_SYSCALL + mov $SYS_exit, %rax + syscall jmp exit # char init[] = "/init\0"; diff --git a/ioapic.c b/ioapic.c index cb0f015..bbe5f9b 100644 --- a/ioapic.c +++ b/ioapic.c @@ -4,6 +4,7 @@ #include "types.h" #include "defs.h" +#include "memlayout.h" #include "traps.h" #define IOAPIC 0xFEC00000 // Default physical address of IO APIC @@ -50,7 +51,7 @@ ioapicinit(void) { int i, id, maxintr; - ioapic = (volatile struct ioapic*)IOAPIC; + ioapic = P2V((volatile struct ioapic*)IOAPIC); maxintr = (ioapicread(REG_VER) >> 16) & 0xFF; id = ioapicread(REG_ID) >> 24; if(id != ioapicid) diff --git a/kalloc.c b/kalloc.c index 14cd4f4..fb939b7 100644 --- a/kalloc.c +++ b/kalloc.c @@ -47,7 +47,7 @@ void freerange(void *vstart, void *vend) { char *p; - p = (char*)PGROUNDUP((uint)vstart); + p = (char*)PGROUNDUP((uint64)vstart); for(; p + PGSIZE <= (char*)vend; p += PGSIZE) kfree(p); } @@ -61,7 +61,7 @@ kfree(char *v) { struct run *r; - if((uint)v % PGSIZE || v < end || V2P(v) >= PHYSTOP) + if((uint64)v % PGSIZE || v < end || 
V2P(v) >= PHYSTOP) panic("kfree"); // Fill with junk to catch dangling refs. @@ -91,6 +91,8 @@ kalloc(void) kmem.freelist = r->next; if(kmem.use_lock) release(&kmem.lock); + if(r != 0 && (uint64) r < KERNBASE) + panic("kalloc"); return (char*)r; } diff --git a/kernel.ld b/kernel.ld index e24c860..e78fd38 100644 --- a/kernel.ld +++ b/kernel.ld @@ -1,22 +1,13 @@ -/* Simple linker script for the JOS kernel. - See the GNU ld 'info' manual ("info ld") to learn the syntax. */ - -OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") -OUTPUT_ARCH(i386) -ENTRY(_start) +OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") +OUTPUT_ARCH(i386:x86-64) SECTIONS { - /* Link the kernel at this address: "." means the current address */ - /* Must be equal to KERNLINK */ - . = 0x80100000; - + . = 0xFFFFFF0000100000; + PROVIDE(text = .); .text : AT(0x100000) { *(.text .stub .text.* .gnu.linkonce.t.*) } - - PROVIDE(etext = .); /* Define the 'etext' symbol to this value */ - .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } @@ -38,31 +29,21 @@ SECTIONS for this section */ } - /* Adjust the address for the data segment to the next page */ . = ALIGN(0x1000); - /* Conventionally, Unix linkers provide pseudo-symbols - * etext, edata, and end, at the end of the text, data, and bss. - * For the kernel mapping, we need the address at the beginning - * of the data section, but that's not one of the conventional - * symbols, because the convention started before there was a - * read-only rodata section between text and data. */ - PROVIDE(data = .); - - /* The data segment */ + /* Conventionally, Unix linkers provide pseudo-symbols + * etext, edata, and end, at the end of the text, data, and bss. + * For the kernel mapping, we need the address at the beginning + * of the data section, but that's not one of the conventional + * symbols, because the convention started before there was a + * read-only rodata section between text and data. 
*/ + PROVIDE(data = .); .data : { *(.data) } - PROVIDE(edata = .); - .bss : { *(.bss) } - PROVIDE(end = .); - - /DISCARD/ : { - *(.eh_frame .note.GNU-stack) - } } diff --git a/main.c b/main.c index 9924e64..449396a 100644 --- a/main.c +++ b/main.c @@ -6,17 +6,22 @@ #include "proc.h" #include "x86.h" -static void startothers(void); -static void mpmain(void) __attribute__((noreturn)); extern pde_t *kpgdir; extern char end[]; // first address after kernel loaded from ELF file +static void main(void) __attribute__((noreturn)); +static void startothers(void); + + // Bootstrap processor starts running C code here. // Allocate a real stack and switch to it, first // doing some setup required for memory allocator to work. int -main(void) +bpmain(uint64 mbmagic, uint64 mbaddr) { + if(mbmagic != 0x2badb002) + panic("multiboot header not found"); + kinit1(end, P2V(4*1024*1024)); // phys page allocator kvmalloc(); // kernel page table mpinit(); // detect other processors @@ -30,26 +35,19 @@ main(void) tvinit(); // trap vectors binit(); // buffer cache fileinit(); // file table - ideinit(); // disk + ideinit(); // disk + startothers(); // start other processors + kinit2(P2V(4*1024*1024), P2V(PHYSTOP)); // must come after startothers() userinit(); // first user process - mpmain(); // finish this processor's setup -} - -// Other CPUs jump here from entryother.S. -static void -mpenter(void) -{ - switchkvm(); - seginit(); - lapicinit(); - mpmain(); + main(); + return 0; } // Common CPU setup code. static void -mpmain(void) +main(void) { cprintf("cpu%d: starting %d\n", cpuid(), cpuid()); idtinit(); // load idt register @@ -57,7 +55,17 @@ mpmain(void) scheduler(); // start running processes } -pde_t entrypgdir[]; // For entry.S +// Other CPUs jump here from entryother.S. +void +apmain(void) +{ + switchkvm(); + seginit(); + lapicinit(); + main(); +} + +void apstart(void); // Start the non-boot (AP) processors. 
static void @@ -72,7 +80,7 @@ startothers(void) // The linker has placed the image of entryother.S in // _binary_entryother_start. code = P2V(0x7000); - memmove(code, _binary_entryother_start, (uint)_binary_entryother_size); + memmove(code, _binary_entryother_start, (uint64)_binary_entryother_size); for(c = cpus; c < cpus+ncpu; c++){ if(c == mycpu()) // We've started already. @@ -82,9 +90,8 @@ startothers(void) // pgdir to use. We cannot use kpgdir yet, because the AP processor // is running in low memory, so we use entrypgdir for the APs too. stack = kalloc(); - *(void**)(code-4) = stack + KSTACKSIZE; - *(void(**)(void))(code-8) = mpenter; - *(int**)(code-12) = (void *) V2P(entrypgdir); + *(uint32*)(code-4) = V2P(apstart); + *(uint64*)(code-12) = (uint64) (stack+KSTACKSIZE); lapicstartap(c->apicid, V2P(code)); @@ -94,23 +101,3 @@ startothers(void) } } -// The boot page table used in entry.S and entryother.S. -// Page directories (and page tables) must start on page boundaries, -// hence the __aligned__ attribute. -// PTE_PS in a page directory entry enables 4Mbyte pages. - -__attribute__((__aligned__(PGSIZE))) -pde_t entrypgdir[NPDENTRIES] = { - // Map VA's [0, 4MB) to PA's [0, 4MB) - [0] = (0) | PTE_P | PTE_W | PTE_PS, - // Map VA's [KERNBASE, KERNBASE+4MB) to PA's [0, 4MB) - [KERNBASE>>PDXSHIFT] = (0) | PTE_P | PTE_W | PTE_PS, -}; - -//PAGEBREAK! -// Blank page. -//PAGEBREAK! -// Blank page. -//PAGEBREAK! -// Blank page. 
- diff --git a/memlayout.h b/memlayout.h index d1615f7..87818d3 100644 --- a/memlayout.h +++ b/memlayout.h @@ -2,13 +2,14 @@ #define EXTMEM 0x100000 // Start of extended memory #define PHYSTOP 0xE000000 // Top physical memory -#define DEVSPACE 0xFE000000 // Other devices are at high addresses +#define DEVSPACE 0xFE000000 // Other devices are top of 32-bit address space +#define DEVSPACETOP 0x100000000 // Key addresses for address space layout (see kmap in vm.c for layout) -#define KERNBASE 0x80000000 // First kernel virtual address +#define KERNBASE 0xFFFFFF0000000000 // First kernel virtual address #define KERNLINK (KERNBASE+EXTMEM) // Address where kernel is linked -#define V2P(a) (((uint) (a)) - KERNBASE) +#define V2P(a) (((uint64) (a)) - KERNBASE) #define P2V(a) ((void *)(((char *) (a)) + KERNBASE)) #define V2P_WO(x) ((x) - KERNBASE) // same as V2P, but without casts diff --git a/mmu.h b/mmu.h index a82d8e2..9450d90 100644 --- a/mmu.h +++ b/mmu.h @@ -2,8 +2,10 @@ // x86 memory management unit (MMU). // Eflags register +#define FL_TF 0x00000100 // Trap Flag #define FL_IF 0x00000200 // Interrupt Enable + // Control Register flags #define CR0_PE 0x00000001 // Protection Enable #define CR0_WP 0x00010000 // Write Protect @@ -11,81 +13,104 @@ #define CR4_PSE 0x00000010 // Page size extension -// various segment selectors. -#define SEG_KCODE 1 // kernel code -#define SEG_KDATA 2 // kernel data+stack -#define SEG_UCODE 3 // user code -#define SEG_UDATA 4 // user data+stack -#define SEG_TSS 5 // this process's task state +// Segment selectors (indexes) in our GDTs. +// Defined by our convention, not the architecture. +#define KCSEG32 (1<<3) /* kernel 32-bit code segment */ +#define KCSEG (2<<3) /* kernel code segment */ +#define KDSEG (3<<3) /* kernel data segment */ +#define TSSSEG (4<<3) /* tss segment - takes two slots */ +#define UDSEG (6<<3) /* user data segment */ +#define UCSEG (7<<3) /* user code segment */ -// cpu->gdt[NSEGS] holds the above segments. 
-#define NSEGS 6 +#define NSEGS 8 #ifndef __ASSEMBLER__ -// Segment Descriptor struct segdesc { - uint lim_15_0 : 16; // Low bits of segment limit - uint base_15_0 : 16; // Low bits of segment base address - uint base_23_16 : 8; // Middle bits of segment base address - uint type : 4; // Segment type (see STS_ constants) - uint s : 1; // 0 = system, 1 = application - uint dpl : 2; // Descriptor Privilege Level - uint p : 1; // Present - uint lim_19_16 : 4; // High bits of segment limit - uint avl : 1; // Unused (available for software use) - uint rsv1 : 1; // Reserved - uint db : 1; // 0 = 16-bit segment, 1 = 32-bit segment - uint g : 1; // Granularity: limit scaled by 4K when set - uint base_31_24 : 8; // High bits of segment base address + uint16 limit0; + uint16 base0; + uint8 base1; + uint8 bits; + uint8 bitslimit1; + uint8 base2; }; -// Normal segment -#define SEG(type, base, lim, dpl) (struct segdesc) \ -{ ((lim) >> 12) & 0xffff, (uint)(base) & 0xffff, \ - ((uint)(base) >> 16) & 0xff, type, 1, dpl, 1, \ - (uint)(lim) >> 28, 0, 0, 1, 1, (uint)(base) >> 24 } -#define SEG16(type, base, lim, dpl) (struct segdesc) \ -{ (lim) & 0xffff, (uint)(base) & 0xffff, \ - ((uint)(base) >> 16) & 0xff, type, 1, dpl, 1, \ - (uint)(lim) >> 16, 0, 0, 1, 0, (uint)(base) >> 24 } +// SEGDESC constructs a segment descriptor literal +// with the given, base, limit, and type bits. +#define SEGDESC(base, limit, bits) (struct segdesc){ \ + (limit)&0xffff, (base)&0xffff, \ + ((base)>>16)&0xff, \ + (bits)&0xff, \ + (((bits)>>4)&0xf0) | ((limit>>16)&0xf), \ + ((base)>>24)&0xff, \ +} + +// SEGDESCHI constructs an extension segment descriptor +// literal that records the high bits of base. 
+#define SEGDESCHI(base) (struct segdesc) { \ + (((base)>>32)&0xffff), (((base)>>48)&0xffff), \ +} + #endif #define DPL_USER 0x3 // User DPL +#define SEG_A (1<<0) /* segment accessed bit */ +#define SEG_R (1<<1) /* readable (code) */ +#define SEG_W (1<<1) /* writable (data) */ +#define SEG_C (1<<2) /* conforming segment (code) */ +#define SEG_E (1<<2) /* expand-down bit (data) */ +#define SEG_CODE (1<<3) /* code segment (instead of data) */ + +// User and system segment bits. +#define SEG_S (1<<4) /* if 0, system descriptor */ +#define SEG_DPL(x) ((x)<<5) /* descriptor privilege level (2 bits) */ +#define SEG_P (1<<7) /* segment present */ +#define SEG_AVL (1<<8) /* available for operating system use */ +#define SEG_L (1<<9) /* long mode */ +#define SEG_D (1<<10) /* default operation size 32-bit */ +#define SEG_G (1<<11) /* granularity */ + // Application segment type bits #define STA_X 0x8 // Executable segment #define STA_W 0x2 // Writeable (non-executable segments) #define STA_R 0x2 // Readable (executable segments) // System segment type bits -#define STS_T32A 0x9 // Available 32-bit TSS -#define STS_IG32 0xE // 32-bit Interrupt Gate -#define STS_TG32 0xF // 32-bit Trap Gate +#define SEG_LDT (2<<0) /* local descriptor table */ +#define SEG_TSS64A (9<<0) /* available 64-bit TSS */ +#define SEG_TSS64B (11<<0) /* busy 64-bit TSS */ +#define SEG_CALL64 (12<<0) /* 64-bit call gate */ +#define SEG_INTR64 (14<<0) /* 64-bit interrupt gate */ +#define SEG_TRAP64 (15<<0) /* 64-bit trap gate */ -// A virtual address 'la' has a three-part structure as follows: +// A virtual address 'la' has a six-part structure as follows: // -// +--------10------+-------10-------+---------12----------+ -// | Page Directory | Page Table | Offset within Page | -// | Index | Index | | -// +----------------+----------------+---------------------+ -// \--- PDX(va) --/ \--- PTX(va) --/ +// +--16--+---9---+------9-------+-----9----+----9-------+----12-------+ +// | Sign | PML4 |Page Directory| 
Page Dir |Page Table | Offset Page | +// |Extend| Index | Pointer Index| Index | Index | in Page | +// +------+-------+--------------+----------+------------+-------------+ +// \-PMX(va)-/\-PDPX(va)--/ \-PDX(va)-/ \-PTX(va)-/ +#define PMX(va) (((uint64)(va) >> PML4XSHIFT) & PXMASK) +#define PDPX(va) (((uint64)(va) >> PDPXSHIFT) & PXMASK) // page directory index -#define PDX(va) (((uint)(va) >> PDXSHIFT) & 0x3FF) - +#define PDX(va) (((uint64)(va) >> PDXSHIFT) & PXMASK) // page table index -#define PTX(va) (((uint)(va) >> PTXSHIFT) & 0x3FF) +#define PTX(va) (((uint64)(va) >> PTXSHIFT) & PXMASK) // construct virtual address from indexes and offset -#define PGADDR(d, t, o) ((uint)((d) << PDXSHIFT | (t) << PTXSHIFT | (o))) +#define PGADDR(d, t, o) ((uint64)((d) << PDXSHIFT | (t) << PTXSHIFT | (o))) // Page directory and page table constants. -#define NPDENTRIES 1024 // # directory entries per page directory -#define NPTENTRIES 1024 // # PTEs per page table +#define NPDENTRIES 512 // # directory entries per page directory +#define NPTENTRIES 512 // # PTEs per page table #define PGSIZE 4096 // bytes mapped by a page #define PTXSHIFT 12 // offset of PTX in a linear address -#define PDXSHIFT 22 // offset of PDX in a linear address +#define PDXSHIFT 21 // offset of PDX in a linear address +#define PDPXSHIFT 30 // offset of PDPX in a linear address +#define PML4XSHIFT 39 // offset of PML4X in a linear address +#define PXMASK 0X1FF #define PGROUNDUP(sz) (((sz)+PGSIZE-1) & ~(PGSIZE-1)) #define PGROUNDDOWN(a) (((a)) & ~(PGSIZE-1)) @@ -95,87 +120,54 @@ struct segdesc { #define PTE_W 0x002 // Writeable #define PTE_U 0x004 // User #define PTE_PS 0x080 // Page Size +#define PTE_PWT 0x008 // Write-Through +#define PTE_PCD 0x010 // Cache-Disable // Address in page table or page directory entry -#define PTE_ADDR(pte) ((uint)(pte) & ~0xFFF) -#define PTE_FLAGS(pte) ((uint)(pte) & 0xFFF) +#define PTE_ADDR(pte) ((uint64)(pte) & ~0xFFF) +#define PTE_FLAGS(pte) ((uint64)(pte) & 0xFFF) 
#ifndef __ASSEMBLER__ -typedef uint pte_t; -// Task state segment format +typedef uint64 pml4e_t; +typedef uint64 pdpe_t; +typedef uint64 pte_t; + struct taskstate { - uint link; // Old ts selector - uint esp0; // Stack pointers and segment selectors - ushort ss0; // after an increase in privilege level - ushort padding1; - uint *esp1; - ushort ss1; - ushort padding2; - uint *esp2; - ushort ss2; - ushort padding3; - void *cr3; // Page directory base - uint *eip; // Saved state from last task switch - uint eflags; - uint eax; // More saved state (registers) - uint ecx; - uint edx; - uint ebx; - uint *esp; - uint *ebp; - uint esi; - uint edi; - ushort es; // Even more saved state (segment selectors) - ushort padding4; - ushort cs; - ushort padding5; - ushort ss; - ushort padding6; - ushort ds; - ushort padding7; - ushort fs; - ushort padding8; - ushort gs; - ushort padding9; - ushort ldt; - ushort padding10; - ushort t; // Trap on task switch - ushort iomb; // I/O map base address + uint8 reserved0[4]; + uint64 rsp[3]; + uint64 ist[8]; + uint8 reserved1[10]; + uint16 iomba; + uint8 iopb[0]; +} __attribute__ ((packed)); + +#define INT_P (1<<7) /* interrupt descriptor present */ + +struct intgate +{ + uint16 rip0; + uint16 cs; + uint8 reserved0; + uint8 bits; + uint16 rip1; + uint32 rip2; + uint32 reserved1; }; -// Gate descriptors for interrupts and traps -struct gatedesc { - uint off_15_0 : 16; // low 16 bits of offset in segment - uint cs : 16; // code segment selector - uint args : 5; // # args, 0 for interrupt/trap gates - uint rsv1 : 3; // reserved(should be zero I guess) - uint type : 4; // type(STS_{IG32,TG32}) - uint s : 1; // must be 0 (system) - uint dpl : 2; // descriptor(meaning new) privilege level - uint p : 1; // Present - uint off_31_16 : 16; // high bits of offset in segment -}; - -// Set up a normal interrupt/trap gate descriptor. -// - istrap: 1 for a trap (= exception) gate, 0 for an interrupt gate. 
-// interrupt gate clears FL_IF, trap gate leaves FL_IF alone -// - sel: Code segment selector for interrupt/trap handler -// - off: Offset in code segment for interrupt/trap handler -// - dpl: Descriptor Privilege Level - -// the privilege level required for software to invoke -// this interrupt/trap gate explicitly using an int instruction. -#define SETGATE(gate, istrap, sel, off, d) \ -{ \ - (gate).off_15_0 = (uint)(off) & 0xffff; \ - (gate).cs = (sel); \ - (gate).args = 0; \ - (gate).rsv1 = 0; \ - (gate).type = (istrap) ? STS_TG32 : STS_IG32; \ - (gate).s = 0; \ - (gate).dpl = (d); \ - (gate).p = 1; \ - (gate).off_31_16 = (uint)(off) >> 16; \ +// INTDESC constructs an interrupt descriptor literal +// that records the given code segment, instruction pointer, +// and type bits. +#define INTDESC(cs, rip, bits) (struct intgate){ \ + (rip)&0xffff, (cs), 0, bits, ((rip)>>16)&0xffff, \ + (uint64)(rip)>>32, 0, \ } +// See section 4.6 of amd64 vol2 +struct desctr +{ + uint16 limit; + uint64 base; +} __attribute__((packed, aligned(16))); // important! + #endif diff --git a/mp.c b/mp.c index 79bb0ad..e36e45c 100644 --- a/mp.c +++ b/mp.c @@ -28,7 +28,7 @@ sum(uchar *addr, int len) // Look for an MP structure in the len bytes at addr. 
static struct mp* -mpsearch1(uint a, int len) +mpsearch1(uint64 a, int len) { uchar *e, *p, *addr; @@ -77,7 +77,7 @@ mpconfig(struct mp **pmp) if((mp = mpsearch()) == 0 || mp->physaddr == 0) return 0; - conf = (struct mpconf*) P2V((uint) mp->physaddr); + conf = (struct mpconf*) P2V((uint64) mp->physaddr); if(memcmp(conf, "PCMP", 4) != 0) return 0; if(conf->version != 1 && conf->version != 4) @@ -101,7 +101,7 @@ mpinit(void) if((conf = mpconfig(&mp)) == 0) panic("Expect to run on an SMP"); ismp = 1; - lapic = (uint*)conf->lapicaddr; + lapic = P2V((uint64)conf->lapicaddr_p); for(p=(uchar*)(conf+1), e=(uchar*)conf+conf->length; p> 32; + __asm volatile("wrmsr" : : "c" (msr), "a" (lo), "d" (hi) : "memory"); +} diff --git a/printf.c b/printf.c index b3298aa..c820305 100644 --- a/printf.c +++ b/printf.c @@ -2,6 +2,10 @@ #include "stat.h" #include "user.h" +#include + +static char digits[] = "0123456789ABCDEF"; + static void putc(int fd, char c) { @@ -11,7 +15,6 @@ putc(int fd, char c) static void printint(int fd, int xx, int base, int sgn) { - static char digits[] = "0123456789ABCDEF"; char buf[16]; int i, neg; uint x; @@ -35,16 +38,25 @@ printint(int fd, int xx, int base, int sgn) putc(fd, buf[i]); } +static void +printptr(int fd, uint64 x) { + int i; + putc(fd, '0'); + putc(fd, 'x'); + for (i = 0; i < (sizeof(uint64) * 2); i++, x <<= 4) + putc(fd, digits[x >> (sizeof(uint64) * 8 - 4)]); +} + // Print to the given fd. Only understands %d, %x, %p, %s. void printf(int fd, const char *fmt, ...) { + va_list ap; char *s; int c, i, state; - uint *ap; + va_start(ap, fmt); state = 0; - ap = (uint*)(void*)&fmt + 1; for(i = 0; fmt[i]; i++){ c = fmt[i] & 0xff; if(state == 0){ @@ -55,14 +67,13 @@ printf(int fd, const char *fmt, ...) 
} } else if(state == '%'){ if(c == 'd'){ - printint(fd, *ap, 10, 1); - ap++; - } else if(c == 'x' || c == 'p'){ - printint(fd, *ap, 16, 0); - ap++; + printint(fd, va_arg(ap, int), 10, 1); + } else if(c == 'x') { + printint(fd, va_arg(ap, int), 16, 0); + } else if(c == 'p') { + printptr(fd, va_arg(ap, uint64)); } else if(c == 's'){ - s = (char*)*ap; - ap++; + s = va_arg(ap, char*); if(s == 0) s = "(null)"; while(*s != 0){ @@ -70,8 +81,7 @@ printf(int fd, const char *fmt, ...) s++; } } else if(c == 'c'){ - putc(fd, *ap); - ap++; + putc(fd, va_arg(ap, uint)); } else if(c == '%'){ putc(fd, c); } else { diff --git a/proc.c b/proc.c index 806b1b1..58ae948 100644 --- a/proc.c +++ b/proc.c @@ -6,6 +6,7 @@ #include "x86.h" #include "proc.h" #include "spinlock.h" +#include "msr.h" struct { struct spinlock lock; @@ -16,7 +17,7 @@ static struct proc *initproc; int nextpid = 1; extern void forkret(void); -extern void trapret(void); +extern void sysexit(void); static void wakeup1(void *chan); @@ -104,13 +105,13 @@ found: // Set up new context to start executing at forkret, // which returns to trapret. 
- sp -= 4; - *(uint*)sp = (uint)trapret; + sp -= sizeof(uint64); + *(uint64*)sp = (uint64)sysexit; sp -= sizeof *p->context; p->context = (struct context*)sp; memset(p->context, 0, sizeof *p->context); - p->context->eip = (uint)forkret; + p->context->eip = (uint64)forkret; return p; } @@ -128,16 +129,12 @@ userinit(void) initproc = p; if((p->pgdir = setupkvm()) == 0) panic("userinit: out of memory?"); - inituvm(p->pgdir, _binary_initcode_start, (int)_binary_initcode_size); + inituvm(p->pgdir, _binary_initcode_start, (uint64)_binary_initcode_size); p->sz = PGSIZE; memset(p->tf, 0, sizeof(*p->tf)); - p->tf->cs = (SEG_UCODE << 3) | DPL_USER; - p->tf->ds = (SEG_UDATA << 3) | DPL_USER; - p->tf->es = p->tf->ds; - p->tf->ss = p->tf->ds; - p->tf->eflags = FL_IF; - p->tf->esp = PGSIZE; - p->tf->eip = 0; // beginning of initcode.S + p->tf->r11 = FL_IF; + p->tf->rsp = PGSIZE; + p->tf->rcx = 0; // beginning of initcode.S safestrcpy(p->name, "initcode", sizeof(p->name)); p->cwd = namei("/"); @@ -201,7 +198,7 @@ fork(void) *np->tf = *curproc->tf; // Clear %eax so that fork returns 0 in the child. - np->tf->eax = 0; + np->tf->rax = 0; for(i = 0; i < NOFILE; i++) if(curproc->ofile[i]) @@ -289,8 +286,8 @@ wait(void) pid = p->pid; kfree(p->kstack); p->kstack = 0; - freevm(p->pgdir); - p->pid = 0; + freevm(p->pgdir, p->sz); + p->pid = 0; p->parent = 0; p->name[0] = 0; p->killed = 0; @@ -339,6 +336,7 @@ scheduler(void) // Switch to chosen process. It is the process's job // to release ptable.lock and then reacquire it // before jumping back to us. + c->proc = p; switchuvm(p); p->state = RUNNING; @@ -408,7 +406,7 @@ forkret(void) iinit(ROOTDEV); initlog(ROOTDEV); } - + // Return to "caller", actually trapret (see allocproc). 
} @@ -514,7 +512,7 @@ procdump(void) int i; struct proc *p; char *state; - uint pc[10]; + uint64 pc[10]; for(p = ptable.proc; p < &ptable.proc[NPROC]; p++){ if(p->state == UNUSED) @@ -525,7 +523,7 @@ procdump(void) state = "???"; cprintf("%d %s %s", p->pid, state, p->name); if(p->state == SLEEPING){ - getcallerpcs((uint*)p->context->ebp+2, pc); + getcallerpcs((uint64*)p->context->ebp+2, pc); for(i=0; i<10 && pc[i] != 0; i++) cprintf(" %p", pc[i]); } diff --git a/proc.h b/proc.h index 1647114..5ab2de5 100644 --- a/proc.h +++ b/proc.h @@ -1,5 +1,8 @@ // Per-CPU state struct cpu { + uint64 syscallno; // Temporary used by sysentry + uint64 usp; // Temporary used by sysentry + struct proc *proc; // The process running on this cpu or null uchar apicid; // Local APIC ID struct context *scheduler; // swtch() here to enter scheduler struct taskstate ts; // Used by x86 to find stack for interrupt @@ -7,7 +10,6 @@ struct cpu { volatile uint started; // Has the CPU started? int ncli; // Depth of pushcli nesting. int intena; // Were interrupts enabled before pushcli? - struct proc *proc; // The process running on this cpu or null }; extern struct cpu cpus[NCPU]; @@ -25,20 +27,23 @@ extern int ncpu; // at the "Switch stacks" comment. Switch doesn't save eip explicitly, // but it is on the stack and allocproc() manipulates it. 
struct context { - uint edi; - uint esi; - uint ebx; - uint ebp; - uint eip; + uint64 r15; + uint64 r14; + uint64 r13; + uint64 r12; + uint64 r11; + uint64 rbx; + uint64 ebp; //rbp + uint64 eip; //rip; }; enum procstate { UNUSED, EMBRYO, SLEEPING, RUNNABLE, RUNNING, ZOMBIE }; // Per-process state struct proc { - uint sz; // Size of process memory (bytes) + char *kstack; // Bottom of kernel stack for this process, must be first entry + uint64 sz; // Size of process memory (bytes) pde_t* pgdir; // Page table - char *kstack; // Bottom of kernel stack for this process enum procstate state; // Process state int pid; // Process ID struct proc *parent; // Parent process diff --git a/spinlock.c b/spinlock.c index 4020186..9ee65f6 100644 --- a/spinlock.c +++ b/spinlock.c @@ -69,17 +69,17 @@ release(struct spinlock *lk) // Record the current call stack in pcs[] by following the %ebp chain. void -getcallerpcs(void *v, uint pcs[]) +getcallerpcs(void *v, uint64 pcs[]) { - uint *ebp; + uint64 *ebp; int i; - ebp = (uint*)v - 2; + asm volatile("mov %%rbp, %0" : "=r" (ebp)); for(i = 0; i < 10; i++){ - if(ebp == 0 || ebp < (uint*)KERNBASE || ebp == (uint*)0xffffffff) + if(ebp == 0 || ebp < (uint64*)KERNBASE || ebp == (uint64*)0xffffffff) break; pcs[i] = ebp[1]; // saved %eip - ebp = (uint*)ebp[0]; // saved %ebp + ebp = (uint64*)ebp[0]; // saved %ebp } for(; i < 10; i++) pcs[i] = 0; diff --git a/spinlock.h b/spinlock.h index 0a9d8e2..90bffdb 100644 --- a/spinlock.h +++ b/spinlock.h @@ -5,7 +5,7 @@ struct spinlock { // For debugging: char *name; // Name of lock. struct cpu *cpu; // The cpu holding the lock. - uint pcs[10]; // The call stack (an array of program counters) + uint64 pcs[10]; // The call stack (an array of program counters) // that locked the lock. 
}; diff --git a/string.c b/string.c index a7cc61f..861ea25 100644 --- a/string.c +++ b/string.c @@ -4,7 +4,7 @@ void* memset(void *dst, int c, uint n) { - if ((int)dst%4 == 0 && n%4 == 0){ + if ((uint64)dst%4 == 0 && n%4 == 0){ c &= 0xFF; stosl(dst, (c<<24)|(c<<16)|(c<<8)|c, n/4); } else diff --git a/swtch.S b/swtch.S index 63a7dcc..de2e79d 100644 --- a/swtch.S +++ b/swtch.S @@ -8,22 +8,28 @@ .globl swtch swtch: - movl 4(%esp), %eax - movl 8(%esp), %edx - - # Save old callee-saved registers - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi + # Save old callee-save registers + push %rbp + push %rbx + push %r11 + push %r12 + push %r13 + push %r14 + push %r15 # Switch stacks - movl %esp, (%eax) - movl %edx, %esp + mov %rsp, (%rdi) # first arg is in rdi + mov %rsi, %rsp # second arg is in rsi + + # Load new callee-save registers + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %r11 + pop %rbx + pop %rbp - # Load new callee-saved registers - popl %edi - popl %esi - popl %ebx - popl %ebp ret + + diff --git a/syscall.c b/syscall.c index ee85261..3ffe3d8 100644 --- a/syscall.c +++ b/syscall.c @@ -15,13 +15,13 @@ // Fetch the int at addr from the current process. int -fetchint(uint addr, int *ip) +fetchint(uint64 addr, int *ip) { struct proc *curproc = myproc(); if(addr >= curproc->sz || addr+4 > curproc->sz) return -1; - *ip = *(int*)(addr); + *ip = *(uint64*)(addr); return 0; } @@ -29,7 +29,7 @@ fetchint(uint addr, int *ip) // Doesn't actually copy the string - just sets *pp to point at it. // Returns length of string, not including nul. 
int -fetchstr(uint addr, char **pp) +fetchstr(uint64 addr, char **pp) { char *s, *ep; struct proc *curproc = myproc(); @@ -45,11 +45,51 @@ fetchstr(uint addr, char **pp) return -1; } +static uint64 +fetcharg(int n) +{ + struct proc *curproc = myproc(); + switch (n) { + case 0: + return curproc->tf->rdi; + case 1: + return curproc->tf->rsi; + case 2: + return curproc->tf->rdx; + case 3: + return curproc->tf->r10; + case 4: + return curproc->tf->r8; + case 5: + return curproc->tf->r9; + } + panic("fetcharg"); + return -1; +} + +int +fetchaddr(uint64 addr, uint64 *ip) +{ + struct proc *curproc = myproc(); + if(addr >= curproc->sz || addr+sizeof(uint64) > curproc->sz) + return -1; + *ip = *(uint64*)(addr); + return 0; +} + // Fetch the nth 32-bit system call argument. int argint(int n, int *ip) { - return fetchint((myproc()->tf->esp) + 4 + 4*n, ip); + *ip = fetcharg(n); + return 0; +} + +int +argaddr(int n, uint64 *ip) +{ + *ip = fetcharg(n); + return 0; } // Fetch the nth word-sized system call argument as a pointer @@ -58,10 +98,10 @@ argint(int n, int *ip) int argptr(int n, char **pp, int size) { - int i; + uint64 i; struct proc *curproc = myproc(); - if(argint(n, &i) < 0) + if(argaddr(n, &i) < 0) return -1; if(size < 0 || (uint)i >= curproc->sz || (uint)i+size > curproc->sz) return -1; @@ -134,12 +174,12 @@ syscall(void) int num; struct proc *curproc = myproc(); - num = curproc->tf->eax; + num = curproc->tf->rax; if(num > 0 && num < NELEM(syscalls) && syscalls[num]) { - curproc->tf->eax = syscalls[num](); + curproc->tf->rax = syscalls[num](); } else { cprintf("%d %s: unknown sys call %d\n", curproc->pid, curproc->name, num); - curproc->tf->eax = -1; + curproc->tf->rax = -1; } } diff --git a/sysfile.c b/sysfile.c index 87e508b..d0de779 100644 --- a/sysfile.c +++ b/sysfile.c @@ -399,16 +399,16 @@ sys_exec(void) { char *path, *argv[MAXARG]; int i; - uint uargv, uarg; + uint64 uargv, uarg; - if(argstr(0, &path) < 0 || argint(1, (int*)&uargv) < 0){ + if(argstr(0, &path) 
< 0 || argaddr(1, &uargv) < 0){ return -1; } memset(argv, 0, sizeof(argv)); for(i=0;; i++){ if(i >= NELEM(argv)) return -1; - if(fetchint(uargv+4*i, (int*)&uarg) < 0) + if(fetchaddr(uargv+sizeof(uint64)*i, (uint64*)&uarg) < 0) return -1; if(uarg == 0){ argv[i] = 0; diff --git a/trap.c b/trap.c index 41c66eb..f27b99b 100644 --- a/trap.c +++ b/trap.c @@ -9,8 +9,8 @@ #include "spinlock.h" // Interrupt descriptor table (shared by all CPUs). -struct gatedesc idt[256]; -extern uint vectors[]; // in vectors.S: array of 256 entry pointers +struct intgate idt[256]; +extern uint64 vectors[]; // in vectors.S: array of 256 entry pointers struct spinlock tickslock; uint ticks; @@ -19,17 +19,22 @@ tvinit(void) { int i; - for(i = 0; i < 256; i++) - SETGATE(idt[i], 0, SEG_KCODE<<3, vectors[i], 0); - SETGATE(idt[T_SYSCALL], 1, SEG_KCODE<<3, vectors[T_SYSCALL], DPL_USER); - + for(i=0; i<256; i++) { + idt[i] = INTDESC(KCSEG, vectors[i], INT_P | SEG_INTR64); + } + idtinit(); + initlock(&tickslock, "time"); } void idtinit(void) { - lidt(idt, sizeof(idt)); + struct desctr dtr; + + dtr.limit = sizeof(idt) - 1; + dtr.base = (uint64)idt; + lidt((void *)&dtr.limit); } //PAGEBREAK: 41 @@ -74,7 +79,7 @@ trap(struct trapframe *tf) case T_IRQ0 + 7: case T_IRQ0 + IRQ_SPURIOUS: cprintf("cpu%d: spurious interrupt at %x:%x\n", - cpuid(), tf->cs, tf->eip); + cpuid(), tf->cs, tf->rip); lapiceoi(); break; @@ -83,14 +88,14 @@ trap(struct trapframe *tf) if(myproc() == 0 || (tf->cs&3) == 0){ // In kernel, it must be our mistake. cprintf("unexpected trap %d from cpu %d eip %x (cr2=0x%x)\n", - tf->trapno, cpuid(), tf->eip, rcr2()); + tf->trapno, cpuid(), tf->rip, rcr2()); panic("trap"); } // In user space, assume process misbehaved. 
cprintf("pid %d %s: trap %d err %d on cpu %d " "eip 0x%x addr 0x%x--kill proc\n", myproc()->pid, myproc()->name, tf->trapno, - tf->err, cpuid(), tf->eip, rcr2()); + tf->err, cpuid(), tf->rip, rcr2()); myproc()->killed = 1; } @@ -105,8 +110,10 @@ trap(struct trapframe *tf) if(myproc() && myproc()->state == RUNNING && tf->trapno == T_IRQ0+IRQ_TIMER) yield(); - + // Check if the process has been killed since we yielded if(myproc() && myproc()->killed && (tf->cs&3) == DPL_USER) exit(); } + + diff --git a/trapasm.S b/trapasm.S index da8aefc..b6dbb1a 100644 --- a/trapasm.S +++ b/trapasm.S @@ -1,32 +1,136 @@ +#include "param.h" +#include "x86.h" #include "mmu.h" - - # vectors.S sends all traps here. + +# vectors.S sends all traps here. .globl alltraps alltraps: # Build trap frame. - pushl %ds - pushl %es - pushl %fs - pushl %gs - pushal + push %r15 + push %r14 + push %r13 + push %r12 + push %r11 + push %r10 + push %r9 + push %r8 + push %rdi + push %rsi + push %rbp + push %rdx + push %rcx + push %rbx + push %rax + + cmpw $KCSEG, 32(%rsp) # compare to saved cs + jz 1f + swapgs - # Set up data segments. - movw $(SEG_KDATA<<3), %ax - movw %ax, %ds - movw %ax, %es - - # Call trap(tf), where tf=%esp - pushl %esp +1:mov %rsp, %rdi # frame in arg1 call trap - addl $4, %esp - # Return falls through to trapret... +# Return falls through to trapret... .globl trapret trapret: - popal - popl %gs - popl %fs - popl %es - popl %ds - addl $0x8, %esp # trapno and errcode - iret + cli + cmpw $KCSEG, 32(%rsp) # compare to saved cs + jz 1f + swapgs + +1:pop %rax + pop %rbx + pop %rcx + pop %rdx + pop %rbp + pop %rsi + pop %rdi + pop %r8 + pop %r9 + pop %r10 + pop %r11 + pop %r12 + pop %r13 + pop %r14 + pop %r15 + + add $16, %rsp # discard trapnum and errorcode + iretq +#PAGEBREAK! + +# syscall_entry jumps here after syscall instruction +.globl sysentry +sysentry: # Build trap frame. 
+ // load kernel stack address
+ swapgs
+ movq %rax, %gs:0 // save %rax in syscallno of cpu entry
+ movq %rsp, %gs:8 // user sp
+ movq %gs:16, %rax // proc entry
+
+ movq %ss:0(%rax), %rax // load kstack from proc
+ addq $(KSTACKSIZE), %rax
+
+ movq %rax, %rsp
+ movq %gs:0, %rax // restore rax
+
+ // push usp
+ push $0
+ push %gs:8
+ // save rflags and rip (in %r11 and %rcx after syscall)
+ push %r11
+ push $UCSEG
+ push %rcx
+ // push errcode and trapno to make stack look like a trap
+ push $0
+ push $64
+
+ // push values on kernel stack
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+ push %r11
+ push %r10
+ push %r9
+ push %r8
+ push %rdi
+ push %rsi
+ push %rbp
+ push %rdx
+ push %rcx
+ push %rbx
+ push %rax
+
+ mov %rsp, %rdi # frame in arg1
+
+ call trap
+#PAGEBREAK!
+
+# Return falls through to trapret...
+.globl sysexit
+sysexit:
+ # to make sure we don't get any interrupts on the user stack while in
+ # supervisor mode. insufficient? (see vulnerability reports for sysret)
+ cli
+
+ pop %rax
+ pop %rbx
+ pop %rcx
+ pop %rdx
+ pop %rbp
+ pop %rsi
+ pop %rdi
+ pop %r8
+ pop %r9
+ pop %r10
+ pop %r11
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+
+ add $(5*8), %rsp # discard trapnum, errorcode, rip, cs and rflags
+ mov (%rsp),%rsp # switch to the user stack
+ swapgs
+
+ sysretq
+
diff --git a/traps.h b/traps.h
index 0bd1fd8..cb6f8a7 100644
--- a/traps.h
+++ b/traps.h
@@ -36,3 +36,4 @@
#define IRQ_ERROR 19
#define IRQ_SPURIOUS 31
+
diff --git a/types.h b/types.h
index e4adf64..ee73164 100644
--- a/types.h
+++ b/types.h
@@ -1,4 +1,10 @@
typedef unsigned int uint;
typedef unsigned short ushort;
typedef unsigned char uchar;
-typedef uint pde_t;
+
+typedef unsigned char uint8;
+typedef unsigned short uint16;
+typedef unsigned int uint32;
+typedef unsigned long uint64;
+
+typedef uint64 pde_t;
diff --git a/usertests.c b/usertests.c
index a1e97e7..07d10d4 100644
--- a/usertests.c
+++ b/usertests.c
@@ -363,17 +363,29 @@ preempt(void)
printf(1, "preempt: ");
pid1 = fork();
+ if(pid1 < 0) {
+ printf(1, 
"fork failed"); + exit(); + } if(pid1 == 0) for(;;) ; pid2 = fork(); + if(pid2 < 0) { + printf(1, "fork failed\n"); + exit(); + } if(pid2 == 0) for(;;) ; pipe(pfds); pid3 = fork(); + if(pid3 < 0) { + printf(1, "fork failed\n"); + exit(); + } if(pid3 == 0){ close(pfds[0]); if(write(pfds[1], "x", 1) != 1) @@ -1391,6 +1403,11 @@ forktest(void) exit(); } + if (n == 0) { + printf(1, "no fork at all!\n"); + exit(); + } + if(n == 1000){ printf(1, "fork claimed to work 1000 times!\n"); exit(); @@ -1414,16 +1431,16 @@ forktest(void) void sbrktest(void) { - int fds[2], pid, pids[10], ppid; - char *a, *b, *c, *lastaddr, *oldbrk, *p, scratch; - uint amt; + int i, fds[2], pids[10], pid, ppid; + char *c, *oldbrk, scratch, *a, *b, *lastaddr, *p; + uint64 amt; + #define BIG (100*1024*1024) printf(stdout, "sbrk test\n"); oldbrk = sbrk(0); // can one sbrk() less than a page? a = sbrk(0); - int i; for(i = 0; i < 5000; i++){ b = sbrk(1); if(b != a){ @@ -1449,9 +1466,8 @@ sbrktest(void) wait(); // can one grow address space to something big? -#define BIG (100*1024*1024) a = sbrk(0); - amt = (BIG) - (uint)a; + amt = (BIG) - (uint64)a; p = sbrk(amt); if (p != a) { printf(stdout, "sbrk test failed to grow big address space; enough phys mem?\n"); @@ -1508,7 +1524,7 @@ sbrktest(void) } wait(); } - + // if we run the system out of memory, does it clean up the last // failed allocation? 
if(pipe(fds) != 0){ @@ -1518,7 +1534,7 @@ sbrktest(void) for(i = 0; i < sizeof(pids)/sizeof(pids[0]); i++){ if((pids[i] = fork()) == 0){ // allocate a lot of memory - sbrk(BIG - (uint)sbrk(0)); + sbrk(BIG - (uint64)sbrk(0)); write(fds[1], "x", 1); // sit around until killed for(;;) sleep(1000); @@ -1526,6 +1542,7 @@ sbrktest(void) if(pids[i] != -1) read(fds[0], &scratch, 1); } + // if those failed allocations freed up the pages they did allocate, // we'll be able to allocate here c = sbrk(4096); @@ -1549,7 +1566,7 @@ sbrktest(void) void validateint(int *p) { - int res; + /* XXX int res; asm("mov %%esp, %%ebx\n\t" "mov %3, %%esp\n\t" "int %2\n\t" @@ -1557,13 +1574,14 @@ validateint(int *p) "=a" (res) : "a" (SYS_sleep), "n" (T_SYSCALL), "c" (p) : "ebx"); + */ } void validatetest(void) { int hi, pid; - uint p; + uint64 p; printf(stdout, "validate test\n"); hi = 1100*1024; diff --git a/usys.S b/usys.S index 8bfd8a1..e62f3d9 100644 --- a/usys.S +++ b/usys.S @@ -5,7 +5,7 @@ .globl name; \ name: \ movl $SYS_ ## name, %eax; \ - int $T_SYSCALL; \ + syscall; \ ret SYSCALL(fork) diff --git a/vectors.pl b/vectors.pl index 57b49dd..d746d6b 100755 --- a/vectors.pl +++ b/vectors.pl @@ -12,9 +12,9 @@ for(my $i = 0; $i < 256; $i++){ print ".globl vector$i\n"; print "vector$i:\n"; if(!($i == 8 || ($i >= 10 && $i <= 14) || $i == 17)){ - print " pushl \$0\n"; + print " push \$0\n"; } - print " pushl \$$i\n"; + print " push \$$i\n"; print " jmp alltraps\n"; } @@ -23,7 +23,7 @@ print ".data\n"; print ".globl vectors\n"; print "vectors:\n"; for(my $i = 0; $i < 256; $i++){ - print " .long vector$i\n"; + print " .quad vector$i\n"; } # sample output: @@ -31,8 +31,8 @@ for(my $i = 0; $i < 256; $i++){ # .globl alltraps # .globl vector0 # vector0: -# pushl $0 -# pushl $0 +# push $0 +# push $0 # jmp alltraps # ... 
# @@ -40,8 +40,8 @@ for(my $i = 0; $i < 256; $i++){ # .data # .globl vectors # vectors: -# .long vector0 -# .long vector1 -# .long vector2 +# .quad vector0 +# .quad vector1 +# .quad vector2 # ... diff --git a/vm.c b/vm.c index 7134cff..fb0cc33 100644 --- a/vm.c +++ b/vm.c @@ -2,13 +2,34 @@ #include "types.h" #include "defs.h" #include "x86.h" +#include "msr.h" #include "memlayout.h" #include "mmu.h" #include "proc.h" #include "elf.h" +#include "traps.h" extern char data[]; // defined by kernel.ld -pde_t *kpgdir; // for use in scheduler() +void sysentry(void); + +static pde_t *kpml4; // kernel address space, used by scheduler and bootup + +// Bootstrap GDT. Used by boot.S but defined in C +// Map "logical" addresses to virtual addresses using identity map. +// Cannot share a CODE descriptor for both kernel and user +// because it would have to have DPL_USR, but the CPU forbids +// an interrupt from CPL=0 to DPL=3. +struct segdesc bootgdt[NSEGS] = { + [0] = SEGDESC(0, 0, 0), // null + [1] = SEGDESC(0, 0xfffff, SEG_R|SEG_CODE|SEG_S|SEG_DPL(0)|SEG_P|SEG_D|SEG_G), // 32-bit kernel code + [2] = SEGDESC(0, 0, SEG_R|SEG_CODE|SEG_S|SEG_DPL(0)|SEG_P|SEG_L|SEG_G), // 64-bit kernel code + [3] = SEGDESC(0, 0xfffff, SEG_W|SEG_S|SEG_DPL(0)|SEG_P|SEG_D|SEG_G), // kernel data + // The order of the user data and user code segments is + // important for syscall instructions. See seginit. + [6] = SEGDESC(0, 0xfffff, SEG_W|SEG_S|SEG_DPL(3)|SEG_P|SEG_D|SEG_G), // 64-bit user data + [7] = SEGDESC(0, 0, SEG_R|SEG_CODE|SEG_S|SEG_DPL(3)|SEG_P|SEG_L|SEG_G), // 64-bit user code +}; + // Set up CPU's kernel segment descriptors. // Run once on entry on each CPU. @@ -16,41 +37,82 @@ void seginit(void) { struct cpu *c; + struct desctr dtr; - // Map "logical" addresses to virtual addresses using identity map. - // Cannot share a CODE descriptor for both kernel and user - // because it would have to have DPL_USR, but the CPU forbids - // an interrupt from CPL=0 to DPL=3. 
- c = &cpus[cpuid()]; - c->gdt[SEG_KCODE] = SEG(STA_X|STA_R, 0, 0xffffffff, 0); - c->gdt[SEG_KDATA] = SEG(STA_W, 0, 0xffffffff, 0); - c->gdt[SEG_UCODE] = SEG(STA_X|STA_R, 0, 0xffffffff, DPL_USER); - c->gdt[SEG_UDATA] = SEG(STA_W, 0, 0xffffffff, DPL_USER); - lgdt(c->gdt, sizeof(c->gdt)); + c = mycpu(); + memmove(c->gdt, bootgdt, sizeof bootgdt); + dtr.limit = sizeof(c->gdt)-1; + dtr.base = (uint64) c->gdt; + lgdt((void *)&dtr.limit); + + // When executing a syscall instruction the CPU sets the SS selector + // to (star >> 32) + 8 and the CS selector to (star >> 32). + // When executing a sysret instruction the CPU sets the SS selector + // to (star >> 48) + 8 and the CS selector to (star >> 48) + 16. + uint64 star = ((((uint64)UCSEG|0x3)- 16)<<48)|((uint64)(KCSEG)<<32); + writemsr(MSR_STAR, star); + writemsr(MSR_LSTAR, (uint64)&sysentry); + writemsr(MSR_SFMASK, FL_TF | FL_IF); + + // Initialize cpu-local storage. + writegs(KDSEG); + writemsr(MSR_GS_BASE, (uint64)c); + writemsr(MSR_GS_KERNBASE, (uint64)c); } // Return the address of the PTE in page table pgdir // that corresponds to virtual address va. If alloc!=0, // create any required page table pages. static pte_t * -walkpgdir(pde_t *pgdir, const void *va, int alloc) +walkpgdir(pde_t *pml4, const void *va, int alloc) { + pml4e_t *pml4e; + pdpe_t *pdp; + pdpe_t *pdpe; pde_t *pde; + pde_t *pd; pte_t *pgtab; - pde = &pgdir[PDX(va)]; - if(*pde & PTE_P){ - pgtab = (pte_t*)P2V(PTE_ADDR(*pde)); - } else { - if(!alloc || (pgtab = (pte_t*)kalloc()) == 0) + // level 4 + pml4e = &pml4[PMX(va)]; + if(*pml4e & PTE_P) + pdp = (pdpe_t*)P2V(PTE_ADDR(*pml4e)); + else { + if(!alloc || (pdp = (pdpe_t*)kalloc()) == 0) return 0; // Make sure all those PTE_P bits are zero. - memset(pgtab, 0, PGSIZE); + memset(pdp, 0, PGSIZE); // The permissions here are overly generous, but they can // be further restricted by the permissions in the page table // entries, if necessary. 
+ *pml4e = V2P(pdp) | PTE_P | PTE_W | PTE_U; + } + + // XXX avoid repetition + + // level 3 + pdpe = &pdp[PDPX(va)]; + if(*pdpe & PTE_P) + pd = (pde_t*)P2V(PTE_ADDR(*pdpe)); + else { + if(!alloc || (pd = (pde_t*)kalloc()) == 0) + return 0; + memset(pd, 0, PGSIZE); + *pdpe = V2P(pd) | PTE_P | PTE_W | PTE_U; + } + + // level 2 + pde = &pd[PDX(va)]; + if(*pde & PTE_P) + pgtab = (pte_t*)P2V(PTE_ADDR(*pde)); + else { + if(!alloc || (pgtab = (pte_t*)kalloc()) == 0) + return 0; + memset(pgtab, 0, PGSIZE); *pde = V2P(pgtab) | PTE_P | PTE_W | PTE_U; } + + // level 1 return &pgtab[PTX(va)]; } @@ -58,13 +120,13 @@ walkpgdir(pde_t *pgdir, const void *va, int alloc) // physical addresses starting at pa. va and size might not // be page-aligned. static int -mappages(pde_t *pgdir, void *va, uint size, uint pa, int perm) +mappages(pde_t *pgdir, void *va, uint64 size, uint64 pa, int perm) { char *a, *last; pte_t *pte; - a = (char*)PGROUNDDOWN((uint)va); - last = (char*)PGROUNDDOWN(((uint)va) + size - 1); + a = (char*)PGROUNDDOWN((uint64)va); + last = (char*)PGROUNDDOWN(((uint64)va) + size - 1); for(;;){ if((pte = walkpgdir(pgdir, a, 1)) == 0) return -1; @@ -80,7 +142,7 @@ mappages(pde_t *pgdir, void *va, uint size, uint pa, int perm) } // There is one page table per process, plus one that's used when -// a CPU is not running any process (kpgdir). The kernel uses the +// a CPU is not running any process (kpml4). The kernel uses the // current process's page table during system calls and interrupts; // page protection bits prevent user code from using the kernel's // mappings. @@ -104,35 +166,36 @@ mappages(pde_t *pgdir, void *va, uint size, uint pa, int perm) // every process's page table. 
static struct kmap { void *virt; - uint phys_start; - uint phys_end; + uint64 phys_start; + uint64 phys_end; int perm; } kmap[] = { { (void*)KERNBASE, 0, EXTMEM, PTE_W}, // I/O space { (void*)KERNLINK, V2P(KERNLINK), V2P(data), 0}, // kern text+rodata { (void*)data, V2P(data), PHYSTOP, PTE_W}, // kern data+memory - { (void*)DEVSPACE, DEVSPACE, 0, PTE_W}, // more devices + { (void*)P2V(DEVSPACE), DEVSPACE, DEVSPACETOP, PTE_W}, // more devices }; // Set up kernel part of a page table. pde_t* setupkvm(void) { - pde_t *pgdir; + pde_t *pml4; struct kmap *k; - if((pgdir = (pde_t*)kalloc()) == 0) + if((pml4 = (pde_t*)kalloc()) == 0) return 0; - memset(pgdir, 0, PGSIZE); - if (P2V(PHYSTOP) > (void*)DEVSPACE) + memset(pml4, 0, PGSIZE); + if (PHYSTOP > DEVSPACE) panic("PHYSTOP too high"); - for(k = kmap; k < &kmap[NELEM(kmap)]; k++) - if(mappages(pgdir, k->virt, k->phys_end - k->phys_start, + for(k = kmap; k < &kmap[NELEM(kmap)]; k++) { + if(mappages(pml4, k->virt, k->phys_end - k->phys_start, (uint)k->phys_start, k->perm) < 0) { - freevm(pgdir); + freevm(pml4, 0); return 0; } - return pgdir; + } + return pml4; } // Allocate one page table for the machine for the kernel address @@ -140,7 +203,7 @@ setupkvm(void) void kvmalloc(void) { - kpgdir = setupkvm(); + kpml4 = setupkvm(); switchkvm(); } @@ -149,13 +212,17 @@ kvmalloc(void) void switchkvm(void) { - lcr3(V2P(kpgdir)); // switch to the kernel page table + lcr3(V2P(kpml4)); // switch to the kernel page table } + // Switch TSS and h/w page table to correspond to process p. 
void switchuvm(struct proc *p) { + struct desctr dtr; + struct cpu *c; + if(p == 0) panic("switchuvm: no process"); if(p->kstack == 0) @@ -164,16 +231,22 @@ switchuvm(struct proc *p) panic("switchuvm: no pgdir"); pushcli(); - mycpu()->gdt[SEG_TSS] = SEG16(STS_T32A, &mycpu()->ts, - sizeof(mycpu()->ts)-1, 0); - mycpu()->gdt[SEG_TSS].s = 0; - mycpu()->ts.ss0 = SEG_KDATA << 3; - mycpu()->ts.esp0 = (uint)p->kstack + KSTACKSIZE; - // setting IOPL=0 in eflags *and* iomb beyond the tss segment limit - // forbids I/O instructions (e.g., inb and outb) from user space - mycpu()->ts.iomb = (ushort) 0xFFFF; - ltr(SEG_TSS << 3); + + c = mycpu(); + uint64 base = (uint64) &(c->ts); + c->gdt[TSSSEG>>3] = SEGDESC(base, (sizeof(c->ts)-1), SEG_P|SEG_TSS64A); + c->gdt[(TSSSEG>>3)+1] = SEGDESCHI(base); + c->ts.rsp[0] = (uint64) p->kstack + KSTACKSIZE; + c->ts.iomba = (ushort) 0xFFFF; + + dtr.limit = sizeof(c->gdt) - 1; + dtr.base = (uint64)c->gdt; + lgdt((void *)&dtr.limit); + + ltr(TSSSEG); + lcr3(V2P(p->pgdir)); // switch to process's address space + popcli(); } @@ -197,10 +270,11 @@ inituvm(pde_t *pgdir, char *init, uint sz) int loaduvm(pde_t *pgdir, char *addr, struct inode *ip, uint offset, uint sz) { - uint i, pa, n; + uint i, n; + uint64 pa; pte_t *pte; - if((uint) addr % PGSIZE != 0) + if((uint64) addr % PGSIZE != 0) panic("loaduvm: addr must be page aligned"); for(i = 0; i < sz; i += PGSIZE){ if((pte = walkpgdir(pgdir, addr+i, 0)) == 0) @@ -222,7 +296,7 @@ int allocuvm(pde_t *pgdir, uint oldsz, uint newsz) { char *mem; - uint a; + uint64 a; if(newsz >= KERNBASE) return 0; @@ -233,13 +307,11 @@ allocuvm(pde_t *pgdir, uint oldsz, uint newsz) for(; a < newsz; a += PGSIZE){ mem = kalloc(); if(mem == 0){ - cprintf("allocuvm out of memory\n"); deallocuvm(pgdir, newsz, oldsz); return 0; } memset(mem, 0, PGSIZE); if(mappages(pgdir, (char*)a, PGSIZE, V2P(mem), PTE_W|PTE_U) < 0){ - cprintf("allocuvm out of memory (2)\n"); deallocuvm(pgdir, newsz, oldsz); kfree(mem); return 0; @@ -253,10 
+325,10 @@ allocuvm(pde_t *pgdir, uint oldsz, uint newsz) // need to be less than oldsz. oldsz can be larger than the actual // process size. Returns the new process size. int -deallocuvm(pde_t *pgdir, uint oldsz, uint newsz) +deallocuvm(pde_t *pgdir, uint64 oldsz, uint64 newsz) { pte_t *pte; - uint a, pa; + uint64 a, pa; if(newsz >= oldsz) return oldsz; @@ -281,20 +353,34 @@ deallocuvm(pde_t *pgdir, uint oldsz, uint newsz) // Free a page table and all the physical memory pages // in the user part. void -freevm(pde_t *pgdir) +freevm(pde_t *pml4, uint64 sz) { - uint i; + uint i, j, k; + pde_t *pdp, *pd, *pt; - if(pgdir == 0) + if(pml4 == 0) panic("freevm: no pgdir"); - deallocuvm(pgdir, KERNBASE, 0); + + deallocuvm(pml4, sz, 0); for(i = 0; i < NPDENTRIES; i++){ - if(pgdir[i] & PTE_P){ - char * v = P2V(PTE_ADDR(pgdir[i])); - kfree(v); + if(pml4[i] & PTE_P){ + pdp = (pdpe_t*)P2V(PTE_ADDR(pml4[i])); + for(j = 0; j < NPDENTRIES; j++){ + if(pdp[j] & PTE_P){ + pd = (pde_t*)P2V(PTE_ADDR(pdp[j])); + for(k = 0; k < NPDENTRIES; k++){ + if(pd[k] & PTE_P) { + pt = (pde_t*)P2V(PTE_ADDR(pd[k])); + kfree((char*)pt); + } + } + kfree((char*)pd); + } + } + kfree((char*)pdp); } } - kfree((char*)pgdir); + kfree((char*)pml4); } // Clear PTE_U on a page. Used to create an inaccessible @@ -317,7 +403,8 @@ copyuvm(pde_t *pgdir, uint sz) { pde_t *d; pte_t *pte; - uint pa, i, flags; + uint64 pa, i; + uint flags; char *mem; if((d = setupkvm()) == 0) @@ -340,7 +427,7 @@ copyuvm(pde_t *pgdir, uint sz) return d; bad: - freevm(d); + freevm(d, sz); return 0; } @@ -366,7 +453,7 @@ int copyout(pde_t *pgdir, uint va, void *p, uint len) { char *buf, *pa0; - uint n, va0; + uint64 n, va0; buf = (char*)p; while(len > 0){ diff --git a/x86.h b/x86.h index 07312a5..17bec0d 100644 --- a/x86.h +++ b/x86.h @@ -1,5 +1,7 @@ // Routines to let C code use special x86 instructions. 
+#ifndef __ASSEMBLER__ + static inline uchar inb(ushort port) { @@ -57,32 +59,16 @@ stosl(void *addr, int data, int cnt) "memory", "cc"); } -struct segdesc; - static inline void -lgdt(struct segdesc *p, int size) +lgdt(void *p) { - volatile ushort pd[3]; - - pd[0] = size-1; - pd[1] = (uint)p; - pd[2] = (uint)p >> 16; - - asm volatile("lgdt (%0)" : : "r" (pd)); + asm volatile("lgdt (%0)" : : "r" (p) : "memory"); } -struct gatedesc; - static inline void -lidt(struct gatedesc *p, int size) +lidt(void *p) { - volatile ushort pd[3]; - - pd[0] = size-1; - pd[1] = (uint)p; - pd[2] = (uint)p >> 16; - - asm volatile("lidt (%0)" : : "r" (pd)); + asm volatile("lidt (%0)" : : "r" (p) : "memory"); } static inline void @@ -91,11 +77,11 @@ ltr(ushort sel) asm volatile("ltr %0" : : "r" (sel)); } -static inline uint +static inline uint64 readeflags(void) { - uint eflags; - asm volatile("pushfl; popl %0" : "=r" (eflags)); + uint64 eflags; + asm volatile("pushf; pop %0" : "=r" (eflags)); return eflags; } @@ -133,51 +119,53 @@ xchg(volatile uint *addr, uint newval) static inline uint rcr2(void) { - uint val; - asm volatile("movl %%cr2,%0" : "=r" (val)); + uint64 val; + asm volatile("mov %%cr2,%0" : "=r" (val)); return val; } static inline void -lcr3(uint val) +lcr3(uint64 val) { - asm volatile("movl %0,%%cr3" : : "r" (val)); + asm volatile("mov %0,%%cr3" : : "r" (val)); } +static inline void +writegs(uint16 v) +{ + __asm volatile("movw %0, %%gs" : : "r" (v)); +} + + //PAGEBREAK: 36 // Layout of the trap frame built on the stack by the // hardware and by trapasm.S, and passed to trap(). 
struct trapframe { - // registers as pushed by pusha - uint edi; - uint esi; - uint ebp; - uint oesp; // useless & ignored - uint ebx; - uint edx; - uint ecx; - uint eax; + uint64 rax; + uint64 rbx; + uint64 rcx; + uint64 rdx; + uint64 rbp; + uint64 rsi; + uint64 rdi; + uint64 r8; + uint64 r9; + uint64 r10; + uint64 r11; + uint64 r12; + uint64 r13; + uint64 r14; + uint64 r15; + uint64 trapno; + uint64 err; + uint64 rip; + uint16 cs; + uint16 padding[3]; + uint64 rflags; + uint64 rsp; + uint64 ss; +}__attribute__((packed)); - // rest of trap frame - ushort gs; - ushort padding1; - ushort fs; - ushort padding2; - ushort es; - ushort padding3; - ushort ds; - ushort padding4; - uint trapno; +#endif - // below here defined by x86 hardware - uint err; - uint eip; - ushort cs; - ushort padding5; - uint eflags; - - // below here only when crossing rings, such as from user to kernel - uint esp; - ushort ss; - ushort padding6; -}; +#define TF_CS 144 // offset in trapframe for saved cs