xv6-riscv-kernel/fs.c
Frans Kaashoek c24ac5d763 Disentangle block size from the disk's sector size. Set block size to 1024 to show
that they can be different.  Clean up mkfs, simplifying specifying fs parameters,
remove some redundancy between fs and mkfs, and fix disk layout bugs. Call blocks
in the file system blocks instead of sectors.  Passes usertests for different
block sizes.
2015-04-03 08:22:02 -04:00

653 lines
15 KiB
C

// File system implementation. Five layers:
// + Blocks: allocator for raw disk blocks.
// + Log: crash recovery for multi-step updates.
// + Files: inode allocator, reading, writing, metadata.
// + Directories: inode with special contents (list of other inodes!)
// + Names: paths like /usr/rtm/xv6/fs.c for convenient naming.
//
// This file contains the low-level file system manipulation
// routines. The (higher-level) system call implementations
// are in sysfile.c.
#include "types.h"
#include "defs.h"
#include "param.h"
#include "stat.h"
#include "mmu.h"
#include "proc.h"
#include "spinlock.h"
#include "fs.h"
#include "buf.h"
#include "file.h"
#define min(a, b) ((a) < (b) ? (a) : (b))
static void itrunc(struct inode*);
// Read the super block.
void
readsb(int dev, struct superblock *sb)
{
struct buf *bp;
bp = bread(dev, 1);
memmove(sb, bp->data, sizeof(*sb));
brelse(bp);
}
// Zero a block.
static void
bzero(int dev, int bno)
{
struct buf *bp;
bp = bread(dev, bno);
memset(bp->data, 0, BSIZE);
log_write(bp);
brelse(bp);
}
// Blocks.
// Allocate a zeroed disk block.
static uint
balloc(uint dev)
{
int b, bi, m;
struct buf *bp;
struct superblock sb;
bp = 0;
readsb(dev, &sb);
for(b = 0; b < sb.size; b += BPB){
bp = bread(dev, BBLOCK(b, sb.ninodes));
for(bi = 0; bi < BPB && b + bi < sb.size; bi++){
m = 1 << (bi % 8);
if((bp->data[bi/8] & m) == 0){ // Is block free?
bp->data[bi/8] |= m; // Mark block in use.
log_write(bp);
brelse(bp);
bzero(dev, b + bi);
return b + bi;
}
}
brelse(bp);
}
panic("balloc: out of blocks");
}
// Free a disk block.
static void
bfree(int dev, uint b)
{
struct buf *bp;
struct superblock sb;
int bi, m;
readsb(dev, &sb);
bp = bread(dev, BBLOCK(b, sb.ninodes));
bi = b % BPB;
m = 1 << (bi % 8);
if((bp->data[bi/8] & m) == 0)
panic("freeing free block");
bp->data[bi/8] &= ~m;
log_write(bp);
brelse(bp);
}
// Inodes.
//
// An inode describes a single unnamed file.
// The inode disk structure holds metadata: the file's type,
// its size, the number of links referring to it, and the
// list of blocks holding the file's content.
//
// The inodes are laid out sequentially on disk immediately after
// the superblock. Each inode has a number, indicating its
// position on the disk.
//
// The kernel keeps a cache of in-use inodes in memory
// to provide a place for synchronizing access
// to inodes used by multiple processes. The cached
// inodes include book-keeping information that is
// not stored on disk: ip->ref and ip->flags.
//
// An inode and its in-memory represtative go through a
// sequence of states before they can be used by the
// rest of the file system code.
//
// * Allocation: an inode is allocated if its type (on disk)
// is non-zero. ialloc() allocates, iput() frees if
// the link count has fallen to zero.
//
// * Referencing in cache: an entry in the inode cache
// is free if ip->ref is zero. Otherwise ip->ref tracks
// the number of in-memory pointers to the entry (open
// files and current directories). iget() to find or
// create a cache entry and increment its ref, iput()
// to decrement ref.
//
// * Valid: the information (type, size, &c) in an inode
// cache entry is only correct when the I_VALID bit
// is set in ip->flags. ilock() reads the inode from
// the disk and sets I_VALID, while iput() clears
// I_VALID if ip->ref has fallen to zero.
//
// * Locked: file system code may only examine and modify
// the information in an inode and its content if it
// has first locked the inode. The I_BUSY flag indicates
// that the inode is locked. ilock() sets I_BUSY,
// while iunlock clears it.
//
// Thus a typical sequence is:
// ip = iget(dev, inum)
// ilock(ip)
// ... examine and modify ip->xxx ...
// iunlock(ip)
// iput(ip)
//
// ilock() is separate from iget() so that system calls can
// get a long-term reference to an inode (as for an open file)
// and only lock it for short periods (e.g., in read()).
// The separation also helps avoid deadlock and races during
// pathname lookup. iget() increments ip->ref so that the inode
// stays cached and pointers to it remain valid.
//
// Many internal file system functions expect the caller to
// have locked the inodes involved; this lets callers create
// multi-step atomic operations.
struct {
struct spinlock lock;
struct inode inode[NINODE];
} icache;
void
iinit(void)
{
initlock(&icache.lock, "icache");
}
static struct inode* iget(uint dev, uint inum);
//PAGEBREAK!
// Allocate a new inode with the given type on device dev.
// A free inode has a type of zero.
struct inode*
ialloc(uint dev, short type)
{
int inum;
struct buf *bp;
struct dinode *dip;
struct superblock sb;
readsb(dev, &sb);
for(inum = 1; inum < sb.ninodes; inum++){
bp = bread(dev, IBLOCK(inum));
dip = (struct dinode*)bp->data + inum%IPB;
if(dip->type == 0){ // a free inode
memset(dip, 0, sizeof(*dip));
dip->type = type;
log_write(bp); // mark it allocated on the disk
brelse(bp);
return iget(dev, inum);
}
brelse(bp);
}
panic("ialloc: no inodes");
}
// Copy a modified in-memory inode to disk.
void
iupdate(struct inode *ip)
{
struct buf *bp;
struct dinode *dip;
bp = bread(ip->dev, IBLOCK(ip->inum));
dip = (struct dinode*)bp->data + ip->inum%IPB;
dip->type = ip->type;
dip->major = ip->major;
dip->minor = ip->minor;
dip->nlink = ip->nlink;
dip->size = ip->size;
memmove(dip->addrs, ip->addrs, sizeof(ip->addrs));
log_write(bp);
brelse(bp);
}
// Find the inode with number inum on device dev
// and return the in-memory copy. Does not lock
// the inode and does not read it from disk.
static struct inode*
iget(uint dev, uint inum)
{
struct inode *ip, *empty;
acquire(&icache.lock);
// Is the inode already cached?
empty = 0;
for(ip = &icache.inode[0]; ip < &icache.inode[NINODE]; ip++){
if(ip->ref > 0 && ip->dev == dev && ip->inum == inum){
ip->ref++;
release(&icache.lock);
return ip;
}
if(empty == 0 && ip->ref == 0) // Remember empty slot.
empty = ip;
}
// Recycle an inode cache entry.
if(empty == 0)
panic("iget: no inodes");
ip = empty;
ip->dev = dev;
ip->inum = inum;
ip->ref = 1;
ip->flags = 0;
release(&icache.lock);
return ip;
}
// Increment reference count for ip.
// Returns ip to enable ip = idup(ip1) idiom.
struct inode*
idup(struct inode *ip)
{
acquire(&icache.lock);
ip->ref++;
release(&icache.lock);
return ip;
}
// Lock the given inode.
// Reads the inode from disk if necessary.
void
ilock(struct inode *ip)
{
struct buf *bp;
struct dinode *dip;
if(ip == 0 || ip->ref < 1)
panic("ilock");
acquire(&icache.lock);
while(ip->flags & I_BUSY)
sleep(ip, &icache.lock);
ip->flags |= I_BUSY;
release(&icache.lock);
if(!(ip->flags & I_VALID)){
bp = bread(ip->dev, IBLOCK(ip->inum));
dip = (struct dinode*)bp->data + ip->inum%IPB;
ip->type = dip->type;
ip->major = dip->major;
ip->minor = dip->minor;
ip->nlink = dip->nlink;
ip->size = dip->size;
memmove(ip->addrs, dip->addrs, sizeof(ip->addrs));
brelse(bp);
ip->flags |= I_VALID;
if(ip->type == 0)
panic("ilock: no type");
}
}
// Unlock the given inode.
void
iunlock(struct inode *ip)
{
if(ip == 0 || !(ip->flags & I_BUSY) || ip->ref < 1)
panic("iunlock");
acquire(&icache.lock);
ip->flags &= ~I_BUSY;
wakeup(ip);
release(&icache.lock);
}
// Drop a reference to an in-memory inode.
// If that was the last reference, the inode cache entry can
// be recycled.
// If that was the last reference and the inode has no links
// to it, free the inode (and its content) on disk.
// All calls to iput() must be inside a transaction in
// case it has to free the inode.
void
iput(struct inode *ip)
{
acquire(&icache.lock);
if(ip->ref == 1 && (ip->flags & I_VALID) && ip->nlink == 0){
// inode has no links and no other references: truncate and free.
if(ip->flags & I_BUSY)
panic("iput busy");
ip->flags |= I_BUSY;
release(&icache.lock);
itrunc(ip);
ip->type = 0;
iupdate(ip);
acquire(&icache.lock);
ip->flags = 0;
wakeup(ip);
}
ip->ref--;
release(&icache.lock);
}
// Common idiom: unlock, then put.
void
iunlockput(struct inode *ip)
{
iunlock(ip);
iput(ip);
}
//PAGEBREAK!
// Inode content
//
// The content (data) associated with each inode is stored
// in blocks on the disk. The first NDIRECT block numbers
// are listed in ip->addrs[]. The next NINDIRECT blocks are
// listed in block ip->addrs[NDIRECT].
// Return the disk block address of the nth block in inode ip.
// If there is no such block, bmap allocates one.
static uint
bmap(struct inode *ip, uint bn)
{
uint addr, *a;
struct buf *bp;
if(bn < NDIRECT){
if((addr = ip->addrs[bn]) == 0)
ip->addrs[bn] = addr = balloc(ip->dev);
return addr;
}
bn -= NDIRECT;
if(bn < NINDIRECT){
// Load indirect block, allocating if necessary.
if((addr = ip->addrs[NDIRECT]) == 0)
ip->addrs[NDIRECT] = addr = balloc(ip->dev);
bp = bread(ip->dev, addr);
a = (uint*)bp->data;
if((addr = a[bn]) == 0){
a[bn] = addr = balloc(ip->dev);
log_write(bp);
}
brelse(bp);
return addr;
}
panic("bmap: out of range");
}
// Truncate inode (discard contents).
// Only called when the inode has no links
// to it (no directory entries referring to it)
// and has no in-memory reference to it (is
// not an open file or current directory).
static void
itrunc(struct inode *ip)
{
int i, j;
struct buf *bp;
uint *a;
for(i = 0; i < NDIRECT; i++){
if(ip->addrs[i]){
bfree(ip->dev, ip->addrs[i]);
ip->addrs[i] = 0;
}
}
if(ip->addrs[NDIRECT]){
bp = bread(ip->dev, ip->addrs[NDIRECT]);
a = (uint*)bp->data;
for(j = 0; j < NINDIRECT; j++){
if(a[j])
bfree(ip->dev, a[j]);
}
brelse(bp);
bfree(ip->dev, ip->addrs[NDIRECT]);
ip->addrs[NDIRECT] = 0;
}
ip->size = 0;
iupdate(ip);
}
// Copy stat information from inode.
void
stati(struct inode *ip, struct stat *st)
{
st->dev = ip->dev;
st->ino = ip->inum;
st->type = ip->type;
st->nlink = ip->nlink;
st->size = ip->size;
}
//PAGEBREAK!
// Read data from inode.
int
readi(struct inode *ip, char *dst, uint off, uint n)
{
uint tot, m;
struct buf *bp;
if(ip->type == T_DEV){
if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].read)
return -1;
return devsw[ip->major].read(ip, dst, n);
}
if(off > ip->size || off + n < off)
return -1;
if(off + n > ip->size)
n = ip->size - off;
for(tot=0; tot<n; tot+=m, off+=m, dst+=m){
bp = bread(ip->dev, bmap(ip, off/BSIZE));
m = min(n - tot, BSIZE - off%BSIZE);
memmove(dst, bp->data + off%BSIZE, m);
brelse(bp);
}
return n;
}
// PAGEBREAK!
// Write data to inode.
int
writei(struct inode *ip, char *src, uint off, uint n)
{
uint tot, m;
struct buf *bp;
if(ip->type == T_DEV){
if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].write)
return -1;
return devsw[ip->major].write(ip, src, n);
}
if(off > ip->size || off + n < off)
return -1;
if(off + n > MAXFILE*BSIZE)
return -1;
for(tot=0; tot<n; tot+=m, off+=m, src+=m){
bp = bread(ip->dev, bmap(ip, off/BSIZE));
m = min(n - tot, BSIZE - off%BSIZE);
memmove(bp->data + off%BSIZE, src, m);
log_write(bp);
brelse(bp);
}
if(n > 0 && off > ip->size){
ip->size = off;
iupdate(ip);
}
return n;
}
//PAGEBREAK!
// Directories
int
namecmp(const char *s, const char *t)
{
return strncmp(s, t, DIRSIZ);
}
// Look for a directory entry in a directory.
// If found, set *poff to byte offset of entry.
struct inode*
dirlookup(struct inode *dp, char *name, uint *poff)
{
uint off, inum;
struct dirent de;
if(dp->type != T_DIR)
panic("dirlookup not DIR");
for(off = 0; off < dp->size; off += sizeof(de)){
if(readi(dp, (char*)&de, off, sizeof(de)) != sizeof(de))
panic("dirlink read");
if(de.inum == 0)
continue;
if(namecmp(name, de.name) == 0){
// entry matches path element
if(poff)
*poff = off;
inum = de.inum;
return iget(dp->dev, inum);
}
}
return 0;
}
// Write a new directory entry (name, inum) into the directory dp.
int
dirlink(struct inode *dp, char *name, uint inum)
{
int off;
struct dirent de;
struct inode *ip;
// Check that name is not present.
if((ip = dirlookup(dp, name, 0)) != 0){
iput(ip);
return -1;
}
// Look for an empty dirent.
for(off = 0; off < dp->size; off += sizeof(de)){
if(readi(dp, (char*)&de, off, sizeof(de)) != sizeof(de))
panic("dirlink read");
if(de.inum == 0)
break;
}
strncpy(de.name, name, DIRSIZ);
de.inum = inum;
if(writei(dp, (char*)&de, off, sizeof(de)) != sizeof(de))
panic("dirlink");
return 0;
}
//PAGEBREAK!
// Paths
// Copy the next path element from path into name.
// Return a pointer to the element following the copied one.
// The returned path has no leading slashes,
// so the caller can check *path=='\0' to see if the name is the last one.
// If no name to remove, return 0.
//
// Examples:
// skipelem("a/bb/c", name) = "bb/c", setting name = "a"
// skipelem("///a//bb", name) = "bb", setting name = "a"
// skipelem("a", name) = "", setting name = "a"
// skipelem("", name) = skipelem("////", name) = 0
//
static char*
skipelem(char *path, char *name)
{
char *s;
int len;
while(*path == '/')
path++;
if(*path == 0)
return 0;
s = path;
while(*path != '/' && *path != 0)
path++;
len = path - s;
if(len >= DIRSIZ)
memmove(name, s, DIRSIZ);
else {
memmove(name, s, len);
name[len] = 0;
}
while(*path == '/')
path++;
return path;
}
// Look up and return the inode for a path name.
// If parent != 0, return the inode for the parent and copy the final
// path element into name, which must have room for DIRSIZ bytes.
// Must be called inside a transaction since it calls iput().
static struct inode*
namex(char *path, int nameiparent, char *name)
{
struct inode *ip, *next;
if(*path == '/')
ip = iget(ROOTDEV, ROOTINO);
else
ip = idup(proc->cwd);
while((path = skipelem(path, name)) != 0){
ilock(ip);
if(ip->type != T_DIR){
iunlockput(ip);
return 0;
}
if(nameiparent && *path == '\0'){
// Stop one level early.
iunlock(ip);
return ip;
}
if((next = dirlookup(ip, name, 0)) == 0){
iunlockput(ip);
return 0;
}
iunlockput(ip);
ip = next;
}
if(nameiparent){
iput(ip);
return 0;
}
return ip;
}
struct inode*
namei(char *path)
{
char name[DIRSIZ];
return namex(path, 0, name);
}
struct inode*
nameiparent(char *path, char *name)
{
return namex(path, 1, name);
}