From de9d72c9086ec935d5b2b889f50ff611135f80fa Mon Sep 17 00:00:00 2001
From: Robert Morris <rtm@csail.mit.edu>
Date: Thu, 13 Jun 2019 06:49:02 -0400
Subject: [PATCH] virtio disk driver

---
 Makefile             |   4 +-
 kernel/bio.c         |   6 +-
 kernel/defs.h        |   5 +
 kernel/kalloc.c      |   1 +
 kernel/kernel.ld     |   1 +
 kernel/main.c        |   3 +-
 kernel/memlayout.h   |   6 +-
 kernel/plic.c        |   7 +-
 kernel/trap.c        |   2 +
 kernel/virtio.h      |  59 ++++++++++
 kernel/virtio_disk.c | 268 +++++++++++++++++++++++++++++++++++++++++++
 kernel/vm.c          |   4 +
 12 files changed, 357 insertions(+), 9 deletions(-)
 create mode 100644 kernel/virtio.h
 create mode 100644 kernel/virtio_disk.c

diff --git a/Makefile b/Makefile
index 545f28c..7580ad5 100644
--- a/Makefile
+++ b/Makefile
@@ -27,7 +27,8 @@ OBJS = \
   $K/exec.o \
   $K/sysfile.o \
   $K/kernelvec.o \
-  $K/plic.o
+  $K/plic.o \
+  $K/virtio_disk.o
 
 # riscv64-unknown-elf- or riscv64-linux-gnu-
 # perhaps in /opt/riscv/bin
@@ -163,6 +164,7 @@ CPUS := 3
 endif
 QEMUOPTS = -machine virt -kernel $K/kernel -m 3G -smp $(CPUS) -nographic
 QEMUOPTS += -initrd fs.img
+QEMUOPTS += -drive file=fs.img,if=none,format=raw,id=x0 -device virtio-blk-device,drive=x0,bus=virtio-mmio-bus.0
 
 qemu: $K/kernel fs.img
 	$(QEMU) $(QEMUOPTS)
diff --git a/kernel/bio.c b/kernel/bio.c
index 90f9af9..07ea030 100644
--- a/kernel/bio.c
+++ b/kernel/bio.c
@@ -101,7 +101,8 @@ bread(uint dev, uint blockno)
 
   b = bget(dev, blockno);
   if((b->flags & B_VALID) == 0) {
-    ramdiskrw(b);
+    //ramdiskrw(b);
+    virtio_disk_rw(b);
   }
   return b;
 }
@@ -113,7 +114,8 @@ bwrite(struct buf *b)
   if(!holdingsleep(&b->lock))
     panic("bwrite");
   b->flags |= B_DIRTY;
-  ramdiskrw(b);
+  //ramdiskrw(b);
+  virtio_disk_rw(b);
 }
 
 // Release a locked buffer.
diff --git a/kernel/defs.h b/kernel/defs.h
index 597e5b6..1b397fe 100644
--- a/kernel/defs.h
+++ b/kernel/defs.h
@@ -201,5 +201,10 @@ uint64          plic_pending(void);
 int             plic_claim(void);
 void            plic_complete(int);
 
+// virtio_disk.c
+void            virtio_disk_init(void);
+void            virtio_disk_rw(struct buf *);
+void            virtio_disk_intr(void);
+
 // number of elements in fixed-size array
 #define NELEM(x) (sizeof(x)/sizeof((x)[0]))
diff --git a/kernel/kalloc.c b/kernel/kalloc.c
index 1ed1c49..afadb02 100644
--- a/kernel/kalloc.c
+++ b/kernel/kalloc.c
@@ -35,6 +35,7 @@ freerange(void *pa_start, void *pa_end)
 {
   char *p;
   p = (char*)PGROUNDUP((uint64)pa_start);
+  p += 4096; // XXX I can't get kernel.ld to place end beyond the last bss symbol.
   for(; p + PGSIZE <= (char*)pa_end; p += PGSIZE)
     kfree(p);
 }
diff --git a/kernel/kernel.ld b/kernel/kernel.ld
index 53c9b90..dec8e4f 100644
--- a/kernel/kernel.ld
+++ b/kernel/kernel.ld
@@ -28,4 +28,5 @@ SECTIONS
     *(.bss)
     PROVIDE(end = .);
   }
+
 }
diff --git a/kernel/main.c b/kernel/main.c
index 2168b9f..d44c82c 100644
--- a/kernel/main.c
+++ b/kernel/main.c
@@ -26,7 +26,8 @@ main()
     plicinithart();  // ask PLIC for device interrupts
     binit();         // buffer cache
     fileinit();      // file table
-    ramdiskinit();   // disk
+    virtio_disk_init(); // emulated hard disk
+    ramdiskinit();   // in-memory disk
     userinit();      // first user process
     started = 1;
   } else {
diff --git a/kernel/memlayout.h b/kernel/memlayout.h
index 462986c..6d86166 100644
--- a/kernel/memlayout.h
+++ b/kernel/memlayout.h
@@ -6,7 +6,8 @@
 // 00001000 -- boot ROM, provided by qemu
 // 02000000 -- CLINT
 // 0C000000 -- PLIC
-// 10000000 -- uart0 registers
+// 10000000 -- uart0
+// 10001000 -- virtio disk
 // 80000000 -- boot ROM jumps here in machine mode
 //             -kernel loads the kernel here
 // 88000000 -- -initrd fs.img ramdisk image.
@@ -21,6 +22,9 @@
 #define UART0 0x10000000L
 #define UART0_IRQ 10
 
+#define VIRTIO 0x10001000
+#define VIRTIO_IRQ 1 // really the first of 8 units
+
 // local interrupt controller, which contains the timer.
 #define CLINT 0x2000000L
 #define CLINT_MTIMECMP(hartid) (CLINT + 0x4000 + 8*(hartid))
diff --git a/kernel/plic.c b/kernel/plic.c
index 0f19ab0..cc9a97e 100644
--- a/kernel/plic.c
+++ b/kernel/plic.c
@@ -11,8 +11,9 @@
 void
 plicinit(void)
 {
-  // set uart's priority to be non-zero (otherwise disabled).
+  // set desired IRQ priorities non-zero (otherwise disabled).
   *(uint32*)(PLIC + UART0_IRQ*4) = 1;
+  *(uint32*)(PLIC + VIRTIO_IRQ*4) = 1;
 }
 
 void
@@ -21,11 +22,9 @@ plicinithart(void)
   int hart = cpuid();
   
   // set uart's enable bit for this hart's S-mode. 
-  //*(uint32*)(PLIC + 0x2080)= (1 << UART0_IRQ);
-  *(uint32*)PLIC_SENABLE(hart)= (1 << UART0_IRQ);
+  *(uint32*)PLIC_SENABLE(hart)= (1 << UART0_IRQ) | (1 << VIRTIO_IRQ);
 
   // set this hart's S-mode priority threshold to 0.
-  //*(uint32*)(PLIC + 0x201000) = 0;
   *(uint32*)PLIC_SPRIORITY(hart) = 0;
 }
 
diff --git a/kernel/trap.c b/kernel/trap.c
index 050a94d..13ad362 100644
--- a/kernel/trap.c
+++ b/kernel/trap.c
@@ -159,6 +159,8 @@ devintr()
 
     if(irq == UART0_IRQ){
       uartintr();
+    } else if(irq == VIRTIO_IRQ){
+      virtio_disk_intr();
     }
 
     plic_complete(irq);
diff --git a/kernel/virtio.h b/kernel/virtio.h
new file mode 100644
index 0000000..258d107
--- /dev/null
+++ b/kernel/virtio.h
@@ -0,0 +1,59 @@
+//
+// virtio device definitions.
+// for both the mmio interface, and virtio descriptors.
+// only tested with qemu.
+// this is the "legacy" virtio interface.
+//
+
+// virtio mmio control registers, mapped starting at 0x10001000.
+// from qemu virtio_mmio.h; each value is a byte offset from that base.
+#define VIRTIO_MMIO_MAGIC_VALUE		0x000 // 0x74726976 ("virt" as little-endian bytes)
+#define VIRTIO_MMIO_VERSION		0x004 // 1 -- version, 1 is legacy
+#define VIRTIO_MMIO_DEVICE_ID		0x008 // 2 -- block device type
+#define VIRTIO_MMIO_VENDOR_ID		0x00c // 0x554d4551
+#define VIRTIO_MMIO_DEVICE_FEATURES	0x010 // read by virtio_disk_init() to get the offered feature bits
+#define VIRTIO_MMIO_DRIVER_FEATURES	0x020 // virtio_disk_init() writes back the bits it accepts
+#define VIRTIO_MMIO_GUEST_PAGE_SIZE	0x028 // page size for PFN, write-only
+#define VIRTIO_MMIO_QUEUE_SEL		0x030 // select queue, write-only
+#define VIRTIO_MMIO_QUEUE_NUM_MAX	0x034 // max size of current queue, read-only
+#define VIRTIO_MMIO_QUEUE_NUM		0x038 // size of current queue, write-only
+#define VIRTIO_MMIO_QUEUE_ALIGN		0x03c // used ring alignment, write-only
+#define VIRTIO_MMIO_QUEUE_PFN		0x040 // physical page number for queue, read/write
+#define VIRTIO_MMIO_QUEUE_READY		0x044 // ready bit; not touched by virtio_disk.c
+#define VIRTIO_MMIO_QUEUE_NOTIFY	0x050 // write-only
+#define VIRTIO_MMIO_INTERRUPT_STATUS	0x060 // read-only
+#define VIRTIO_MMIO_INTERRUPT_ACK	0x064 // write-only
+#define VIRTIO_MMIO_STATUS		0x070 // read/write
+
+// status register bits, from qemu virtio_config.h; virtio_disk_init() ORs them into VIRTIO_MMIO_STATUS.
+#define VIRTIO_CONFIG_S_ACKNOWLEDGE	1 // first bit the driver sets, once the device is identified
+#define VIRTIO_CONFIG_S_DRIVER		2 // set next, before feature negotiation
+#define VIRTIO_CONFIG_S_DRIVER_OK	4 // tells the device the driver is completely ready
+#define VIRTIO_CONFIG_S_FEATURES_OK	8 // set after DRIVER_FEATURES has been written
+
+// device feature bits. values are bit positions (used as 1 << n); virtio_disk_init() clears every one of these.
+#define VIRTIO_BLK_F_RO		5	/* Disk is read-only */
+#define VIRTIO_BLK_F_SCSI	7	/* Supports scsi command passthru */
+#define VIRTIO_BLK_F_CONFIG_WCE	11	/* Writeback mode available in config */
+#define VIRTIO_BLK_F_MQ		12	/* support more than one vq */
+#define VIRTIO_F_ANY_LAYOUT		27
+#define VIRTIO_RING_F_INDIRECT_DESC	28
+#define VIRTIO_RING_F_EVENT_IDX		29
+
+struct VRingDesc { // one entry of the descriptor table (desc[] in virtio_disk.c).
+  uint64 addr;  // address of the buffer; virtio_disk_rw() stores a pointer cast to uint64.
+  uint32 len;   // length of the buffer in bytes.
+  uint16 flags; // VRING_DESC_F_* bits below.
+  uint16 next;  // index of the next descriptor in the chain; followed only if VRING_DESC_F_NEXT is set.
+};
+#define VRING_DESC_F_NEXT	1 // chain continues at the descriptor named by next
+#define VRING_DESC_F_WRITE	2 // device writes (vs read)
+
+struct VRingUsedElem { // one entry of the used ring; read by virtio_disk_intr().
+  uint32 id; // index of start of completed descriptor chain
+  uint32 len; // not examined by virtio_disk.c
+};
+
+// for disk ops: request type that virtio_disk_rw() stores in the request header.
+#define VIRTIO_BLK_T_IN		0 // read the disk
+#define VIRTIO_BLK_T_OUT	1 // write the disk
diff --git a/kernel/virtio_disk.c b/kernel/virtio_disk.c
new file mode 100644
index 0000000..558d3b0
--- /dev/null
+++ b/kernel/virtio_disk.c
@@ -0,0 +1,268 @@
+//
+// driver for qemu's virtio disk device.
+// uses qemu's mmio interface to virtio.
+// qemu presents a "legacy" virtio interface.
+//
+// qemu ... -drive file=fs.img,if=none,format=raw,id=x0 -device virtio-blk-device,drive=x0,bus=virtio-mmio-bus.0
+//
+
+#include "types.h"
+#include "riscv.h"
+#include "defs.h"
+#include "param.h"
+#include "memlayout.h"
+#include "spinlock.h"
+#include "sleeplock.h"
+#include "fs.h"
+#include "buf.h"
+#include "virtio.h"
+
+// the address of a virtio mmio register; off is one of the VIRTIO_MMIO_* offsets.
+#define R(off) ((volatile uint32 *)(VIRTIO + (off)))
+
+struct spinlock virtio_disk_lock; // held by virtio_disk_rw() and virtio_disk_intr() around the state below.
+
+// this many virtio descriptors.
+// must be a power of two.
+#define NUM 8
+
+// memory for virtio descriptors &c for queue 0.
+// this is a global instead of allocated because it has
+// to be multiple contiguous pages, which kalloc()
+// doesn't support.
+__attribute__ ((aligned (PGSIZE)))
+static char pages[2*PGSIZE];
+static struct VRingDesc *desc; // descriptor table; set to the start of pages.
+static uint16 *avail;          // available ring; placed right after the NUM descriptors.
+static char *used;             // used ring; set to pages + PGSIZE.
+
+// our own book-keeping.
+static char free[NUM];  // is a descriptor free?
+static uint16 used_idx; // we've looked this far in used[2..NUM].
+
+// track info about in-flight operations,
+// for use when completion interrupt arrives.
+// indexed by first descriptor index of chain.
+static struct {
+  struct buf *b; // buf being read or written; cleared by virtio_disk_intr() on completion.
+} info[NUM];
+
+void
+virtio_disk_init(void)
+{
+  uint32 status = 0;
+
+  initlock(&virtio_disk_lock, "virtio_disk");
+
+  // find the device, negotiate features, and set up queue 0.
+  // the register protocol follows qemu's virtio-mmio.c.
+  if(*R(VIRTIO_MMIO_MAGIC_VALUE) != 0x74726976 ||
+     *R(VIRTIO_MMIO_VERSION) != 1 ||
+     *R(VIRTIO_MMIO_DEVICE_ID) != 2 ||
+     *R(VIRTIO_MMIO_VENDOR_ID) != 0x554d4551){
+    panic("could not find virtio disk");
+  }
+
+  status |= VIRTIO_CONFIG_S_ACKNOWLEDGE;
+  *R(VIRTIO_MMIO_STATUS) = status;
+
+  status |= VIRTIO_CONFIG_S_DRIVER;
+  *R(VIRTIO_MMIO_STATUS) = status;
+
+  // negotiate features
+  uint64 features = *R(VIRTIO_MMIO_DEVICE_FEATURES);
+  features &= ~(1 << VIRTIO_BLK_F_RO);
+  features &= ~(1 << VIRTIO_BLK_F_SCSI);
+  features &= ~(1 << VIRTIO_BLK_F_CONFIG_WCE);
+  features &= ~(1 << VIRTIO_BLK_F_MQ);
+  features &= ~(1 << VIRTIO_F_ANY_LAYOUT);
+  features &= ~(1 << VIRTIO_RING_F_EVENT_IDX);
+  features &= ~(1 << VIRTIO_RING_F_INDIRECT_DESC);
+  *R(VIRTIO_MMIO_DRIVER_FEATURES) = features;
+
+  // tell device that feature negotiation is complete.
+  status |= VIRTIO_CONFIG_S_FEATURES_OK;
+  *R(VIRTIO_MMIO_STATUS) = status;
+
+  *R(VIRTIO_MMIO_GUEST_PAGE_SIZE) = PGSIZE;
+
+  // qemu's hw/virtio/virtio.c
+  // initialize queue 0
+  *R(VIRTIO_MMIO_QUEUE_SEL) = 0;
+  uint32 max = *R(VIRTIO_MMIO_QUEUE_NUM_MAX);
+  if(max == 0)
+    panic("virtio disk has no queue 0");
+  if(max < NUM)
+    panic("virtio disk max queue too short");
+  *R(VIRTIO_MMIO_QUEUE_NUM) = NUM;
+  memset(pages, 0, sizeof(pages));
+  *R(VIRTIO_MMIO_QUEUE_PFN) = ((uint64)pages) >> PGSHIFT;
+
+  // desc = pages -- num * VRingDesc
+  // avail = pages + 0x80 -- 2 * uint16, then num * uint16
+  // used = pages + 4096 -- 2 * uint16, then num * vRingUsedElem
+
+  desc = (struct VRingDesc *) pages;
+  avail = (uint16*)(((char*)desc) + NUM*sizeof(struct VRingDesc));
+  used = pages + PGSIZE;
+
+  for(int i = 0; i < NUM; i++)
+    free[i] = 1;
+
+  // tell device we're completely ready. this comes last: the spec
+  // says DRIVER_OK makes the device live, so the queue must
+  // already be set up.
+  status |= VIRTIO_CONFIG_S_DRIVER_OK;
+  *R(VIRTIO_MMIO_STATUS) = status;
+}
+
+// claim the lowest-numbered free descriptor; return its index, or -1 if none.
+static int
+alloc_desc()
+{
+  int i;
+  for(i = 0; i < NUM && !free[i]; i++)
+    ;
+  if(i == NUM)
+    return -1;
+  free[i] = 0;
+  return i;
+}
+
+static void
+free_desc(int i) // return descriptor i to the free pool.
+{
+  if(i < 0 || i >= NUM) // index must name an entry of free[].
+    panic("free_desc 1");
+  if(free[i]) // already free: the caller is freeing it twice.
+    panic("free_desc 2");
+  free[i] = 1;
+}
+
+void
+virtio_disk_rw(struct buf *b)
+{
+  uint64 sector = b->blockno * (BSIZE / 512); // block number converted to 512-byte units.
+
+  acquire(&virtio_disk_lock);
+
+  // write b to disk if B_DIRTY is set, else read into b; sleeps until done.
+  // the spec says legacy block operations always use three descriptors:
+  // type/reserved/sector, then the data, then a 1-byte status result.
+
+  // allocate the three descriptors.
+  int idx[3];
+  while(1){
+    int done = 1;
+    for(int i = 0; i < 3; i++){
+      idx[i] = alloc_desc();
+      if(idx[i] < 0){
+        for(int j = 0; j < i; j++)
+          free_desc(idx[j]); // give back the partial allocation.
+        wakeup(&free[0]);
+        done = 0;
+        break;
+      }
+    }
+    if(done)
+      break;
+    sleep(&free[0], &virtio_disk_lock); // virtio_disk_intr() wakes this channel after freeing descriptors.
+  }
+
+  // format the three descriptors; qemu's virtio-blk.c reads them.
+  // NOTE(review): buf0 and status below are locals, so the device is given
+  // stack addresses; assumes these are usable as physical addresses -- confirm.
+  struct virtio_blk_outhdr {
+    uint32 type;
+    uint32 reserved;
+    uint64 sector;
+  } buf0;
+
+  if(b->flags & B_DIRTY)
+    buf0.type = VIRTIO_BLK_T_OUT; // write the disk
+  else
+    buf0.type = VIRTIO_BLK_T_IN; // read the disk
+  buf0.reserved = 0;
+  buf0.sector = sector;
+
+  desc[idx[0]].addr = (uint64) &buf0;
+  desc[idx[0]].len = sizeof(buf0);
+  desc[idx[0]].flags = VRING_DESC_F_NEXT;
+  desc[idx[0]].next = idx[1];
+
+  desc[idx[1]].addr = (uint64) b->data;
+  desc[idx[1]].len = BSIZE;
+  if(b->flags & B_DIRTY)
+    desc[idx[1]].flags = 0; // device reads b->data
+  else
+    desc[idx[1]].flags = VRING_DESC_F_WRITE; // device writes b->data
+  desc[idx[1]].flags |= VRING_DESC_F_NEXT;
+  desc[idx[1]].next = idx[2];
+
+  char status = 0; // XXX never examined after the request completes.
+  desc[idx[2]].addr = (uint64) &status;
+  desc[idx[2]].len = 1;
+  desc[idx[2]].flags = VRING_DESC_F_WRITE; // device writes the status
+  desc[idx[2]].next = 0; // no VRING_DESC_F_NEXT: this ends the chain.
+
+  // record struct buf for virtio_disk_intr().
+  info[idx[0]].b = b;
+
+  // avail[0] is flags
+  // avail[1] tells the device how far to look in avail[2...].
+  // avail[2...] are desc[] indices the device should process.
+  // we only tell device the first index in our chain of descriptors.
+  avail[2 + (avail[1] % NUM)] = idx[0];
+  __sync_synchronize(); // fence between filling the ring slot and bumping avail[1].
+  avail[1] = avail[1] + 1;
+
+  *R(VIRTIO_MMIO_QUEUE_NOTIFY) = 0; // value is queue number
+
+  // Wait for virtio_disk_intr() to say request has finished.
+  while((b->flags & (B_VALID|B_DIRTY)) != B_VALID){
+    sleep(b, &virtio_disk_lock); // virtio_disk_intr() calls wakeup() on this buf.
+  }
+
+  release(&virtio_disk_lock);
+}
+
+void
+virtio_disk_intr()
+{
+  // interrupt handler: retire each request the device has completed
+  // and wake the process sleeping on its buf.
+  // the used area is: uint16 flags, uint16 idx, array of VRingUsedElem.
+
+  acquire(&virtio_disk_lock);
+
+  // per the spec, acknowledge the interrupt by writing the bits
+  // read from INTERRUPT_STATUS to INTERRUPT_ACK.
+  *R(VIRTIO_MMIO_INTERRUPT_ACK) = *R(VIRTIO_MMIO_INTERRUPT_STATUS) & 0x3;
+
+  while((used_idx % NUM) != (*(volatile uint16 *)(used+2) % NUM)){
+    struct VRingUsedElem *ue = (struct VRingUsedElem *) (used + 4 + 8*used_idx);
+
+    if(ue->id >= NUM || info[ue->id].b == 0) // id is supplied by the device.
+      panic("virtio_disk_intr: bad used id");
+    // XXX check the one-byte status in the 3rd descriptor.
+
+    info[ue->id].b->flags |= B_VALID; // lets virtio_disk_rw()'s wait loop exit.
+    info[ue->id].b->flags &= ~B_DIRTY;
+    wakeup(info[ue->id].b);
+    info[ue->id].b = 0;
+
+    uint i = ue->id;
+    while(1){
+      desc[i].addr = 0;
+      free_desc(i);
+      if(desc[i].flags & VRING_DESC_F_NEXT)
+        i = desc[i].next;
+      else
+        break;
+    }
+    wakeup(&free[0]);
+
+    used_idx = (used_idx + 1) % NUM;
+  }
+
+  release(&virtio_disk_lock);
+}
diff --git a/kernel/vm.c b/kernel/vm.c
index 0ea6bca..0d0a9d9 100644
--- a/kernel/vm.c
+++ b/kernel/vm.c
@@ -30,6 +30,10 @@ kvminit()
   mappages(kernel_pagetable, UART0, PGSIZE,
            UART0, PTE_R | PTE_W);
 
+  // virtio disk interface
+  mappages(kernel_pagetable, VIRTIO, PGSIZE,
+           VIRTIO, PTE_R | PTE_W);
+
   // CLINT
   mappages(kernel_pagetable, CLINT, 0x10000,
            CLINT, PTE_R | PTE_W);