⛏️ index : haiku.git

/*
 * Copyright 2002-2016, Axel Dörfler, axeld@pinc-software.de.
 * Distributed under the terms of the MIT License.
 *
 * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
 * Distributed under the terms of the NewOS License.
 */


#include <fs/devfs.h>

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>

#include <Drivers.h>
#include <KernelExport.h>
#include <NodeMonitor.h>

#include <arch/cpu.h>
#include <AutoDeleter.h>
#include <boot/kernel_args.h>
#include <boot_device.h>
#include <debug.h>
#include <elf.h>
#include <FindDirectory.h>
#include <fs/devfs.h>
#include <fs/KPath.h>
#include <fs/node_monitor.h>
#include <kdevice_manager.h>
#include <lock.h>
#include <Notifications.h>
#include <util/AutoLock.h>
#include <util/fs_trim_support.h>
#include <vfs.h>
#include <vm/vm.h>
#include <wait_for_objects.h>

#include "BaseDevice.h"
#include "FileDevice.h"
#include "IORequest.h"
#include "legacy_drivers.h"


// Debug tracing: uncomment the define below to route TRACE() to dprintf().
// TRACE() takes its whole argument list in an extra pair of parentheses,
// e.g. TRACE(("value: %d\n", value)); without TRACE_DEVFS it expands to
// nothing.
//#define TRACE_DEVFS
#ifdef TRACE_DEVFS
#	define TRACE(x) dprintf x
#else
#	define TRACE(x)
#endif


namespace {

// Per-partition data attached to a partition's device vnode.
struct devfs_partition {
	struct devfs_vnode*	raw_device;
		// vnode of the device the partition lives on; add_partition()
		// acquires a vnode reference on it
	partition_info		info;
		// copy of the partition_info passed to add_partition()
};

struct driver_entry;

// Values stored in devfs_stream::u.dir.scanned; they are compared with ">="
// against scan_mode(), so their numeric order matters.
enum {
	kNotScanned = 0,
	kBootScan,
	kNormalScan,
};

// Type specific part of a vnode. Which union member is valid depends on
// the file type bits in "type" (directory, character device, or symlink).
struct devfs_stream {
	mode_t				type;
	union {
		struct stream_dir {
			struct devfs_vnode*		dir_head;
				// first entry of the singly linked (via dir_next),
				// alphabetically sorted list of children
			struct list				cookies;
				// all open devfs_dir_cookie objects of this directory
			mutex					scan_lock;
				// serializes scan_for_drivers_if_needed()
			int32					scanned;
				// one of kNotScanned/kBootScan/kNormalScan
		} dir;
		struct stream_dev {
			BaseDevice*				device;
			struct devfs_partition*	partition;
				// NULL for raw devices
		} dev;
		struct stream_symlink {
			const char*				path;
			size_t					length;
		} symlink;
	} u;
};

// A node in the devfs tree.
struct devfs_vnode {
	struct devfs_vnode*	all_next;
		// hash table link (see NodeHash::GetLink())
	ino_t				id;
	char*				name;
		// heap allocated (strdup()), freed together with the vnode
	timespec			modification_time;
	timespec			creation_time;
	uid_t				uid;
	gid_t				gid;
	struct devfs_vnode*	parent;
		// the root vnode is its own parent (set up in devfs_mount())
	struct devfs_vnode*	dir_next;
		// next sibling in the parent's entry list
	struct devfs_stream	stream;
};

#define DEVFS_HASH_SIZE 16


// Hash table definition mapping node IDs to devfs_vnode objects; the table
// chains entries through devfs_vnode::all_next.
struct NodeHash {
	typedef ino_t			KeyType;
	typedef	devfs_vnode		ValueType;

	// Folds the upper 32 bits of the ID into the lower ones.
	size_t HashKey(KeyType key) const
	{
		return key ^ (key >> 32);
	}

	size_t Hash(ValueType* value) const
	{
		return HashKey(value->id);
	}

	bool Compare(KeyType key, ValueType* value) const
	{
		return value->id == key;
	}

	ValueType*& GetLink(ValueType* value) const
	{
		return value->all_next;
	}
};

typedef BOpenHashTable<NodeHash> NodeTable;

// State of a mounted devfs instance.
struct devfs {
	dev_t				id;
	fs_volume*			volume;
	recursive_lock		lock;
		// taken by the hooks and helpers below before they touch the
		// node hash or the directory lists
	int32				next_vnode_id;
		// ID handed to the next vnode created by devfs_create_vnode()
	NodeTable*			vnode_hash;
	struct devfs_vnode*	root_vnode;
};

// Cookie of an open directory; linked into stream.u.dir.cookies.
struct devfs_dir_cookie {
	struct list_link	link;
	struct devfs_vnode*	current;
		// next child to be returned by devfs_read_dir()
	int32				state;	// iteration state
};

// Cookie of an open node; wraps the cookie returned by the device.
struct devfs_cookie {
	void*				device_cookie;
};

// directory iteration states
enum {
	ITERATION_STATE_DOT		= 0,
	ITERATION_STATE_DOT_DOT	= 1,
	ITERATION_STATE_OTHERS	= 2,
	ITERATION_STATE_BEGIN	= ITERATION_STATE_DOT,
};

// extern only to make forward declaration possible
extern fs_volume_ops kVolumeOps;
extern fs_vnode_ops kVnodeOps;

} // namespace


// Forward declarations of helpers that are used before their definition.
static status_t get_node_for_path(struct devfs* fs, const char* path,
	struct devfs_vnode** _node);
static void get_device_name(struct devfs_vnode* vnode, char* buffer,
	size_t size);
static status_t unpublish_node(struct devfs* fs, devfs_vnode* node,
	mode_t type);
static status_t publish_device(struct devfs* fs, const char* path,
	BaseDevice* device);


// The one and only allowed devfs instance
// (set by devfs_mount(), NULL as long as devfs is not mounted)
static struct devfs* sDeviceFileSystem = NULL;


//	#pragma mark - devfs private


/*!	Returns the current real time clock value as a timespec. */
static timespec
current_timespec()
{
	const bigtime_t kMicrosPerSecond = 1000000;
	const bigtime_t now = real_time_clock_usecs();

	// split the microsecond count into whole seconds and the remainder
	// (converted to nanoseconds)
	timespec result;
	result.tv_sec = now / kMicrosPerSecond;
	result.tv_nsec = (now % kMicrosPerSecond) * 1000;
	return result;
}


/*!	Returns the ID of the parent of \a vnode, or -1 if it has none. */
static ino_t
get_parent_id(struct devfs_vnode* vnode)
{
	struct devfs_vnode* parent = vnode->parent;
	if (parent == NULL)
		return -1;

	return parent->id;
}


/*!	Returns the scan level that directories are expected to have reached
	right now. Every directory may be scanned twice: once before a boot
	device is known (kBootScan), and once after (kNormalScan).
*/
static int32
scan_mode(void)
{
	if (gBootDevice >= 0)
		return kNormalScan;

	return kBootScan;
}


/*!	Probes for drivers below the directory \a dir, unless that has already
	been done for the current scan mode.
	The directory's scan lock is held for the whole duration.
	\return \c B_OK on success (or if nothing had to be done), \c B_NO_MEMORY
		if the path buffer could not be allocated.
*/
static status_t
scan_for_drivers_if_needed(devfs_vnode* dir)
{
	ASSERT(S_ISDIR(dir->stream.type));

	MutexLocker scanLocker(dir->stream.u.dir.scan_lock);

	// nothing to do if the directory is already up to date
	if (dir->stream.u.dir.scanned >= scan_mode())
		return B_OK;

	KPath devicePath;
	if (devicePath.InitCheck() != B_OK)
		return B_NO_MEMORY;

	char* pathBuffer = devicePath.LockBuffer();
	get_device_name(dir, pathBuffer, devicePath.BufferSize());
	devicePath.UnlockBuffer();

	TRACE(("scan_for_drivers_if_needed: mode %" B_PRId32 ": %s\n",
		scan_mode(), devicePath.Path()));

	// let both the device manager and the legacy driver code look for
	// drivers at this path
	static int32 updateCycle = 1;
	device_manager_probe(devicePath.Path(), updateCycle++);
	legacy_driver_probe(devicePath.Path());

	// remember how far this directory has been scanned
	dir->stream.u.dir.scanned = scan_mode();
	return B_OK;
}


static void
init_directory_vnode(struct devfs_vnode* vnode, int permissions)
{
	vnode->stream.type = S_IFDIR | permissions;
		mutex_init(&vnode->stream.u.dir.scan_lock, "devfs scan");
	vnode->stream.u.dir.dir_head = NULL;
	list_init(&vnode->stream.u.dir.cookies);
}


/*!	Allocates a new, zero-initialized vnode named \a name and assigns it the
	next free node ID of \a fs. The node is neither typed nor linked into
	the hash table or a directory yet.
	\param parent Optional; if given, the new node inherits its group.
	\return The new vnode, or \c NULL if out of memory.
*/
static struct devfs_vnode*
devfs_create_vnode(struct devfs* fs, devfs_vnode* parent, const char* name)
{
	struct devfs_vnode* node
		= (struct devfs_vnode*)malloc(sizeof(struct devfs_vnode));
	if (node == NULL)
		return NULL;

	memset(node, 0, sizeof(struct devfs_vnode));
	node->id = fs->next_vnode_id++;

	node->name = strdup(name);
	if (node->name == NULL) {
		free(node);
		return NULL;
	}

	const timespec now = current_timespec();
	node->modification_time = now;
	node->creation_time = now;

	node->uid = geteuid();

	// inherit group from parent if possible
	if (parent != NULL)
		node->gid = parent->gid;
	else
		node->gid = getegid();

	return node;
}


/*!	Removes \a vnode from the node hash of \a fs and frees it.
	For raw device nodes the underlying device is notified via Removed().
	For partition nodes the reference to the raw device vnode is released
	and the partition descriptor allocated by add_partition() is freed.
	For directories the scan lock is destroyed.

	\param forceDelete If \c false, the node is only deleted if it is not
		a directory that still has entries, and has no following sibling
		in a directory list (\c dir_next is \c NULL).
	\return \c B_OK on success, \c B_NOT_ALLOWED if the node could not be
		deleted for the reasons above.
*/
static status_t
devfs_delete_vnode(struct devfs* fs, struct devfs_vnode* vnode,
	bool forceDelete)
{
	// Can't delete it if it's in a directory or is a directory
	// and has children
	if (!forceDelete && ((S_ISDIR(vnode->stream.type)
				&& vnode->stream.u.dir.dir_head != NULL)
			|| vnode->dir_next != NULL))
		return B_NOT_ALLOWED;

	// remove it from the global hash table
	fs->vnode_hash->Remove(vnode);

	if (S_ISCHR(vnode->stream.type)) {
		struct devfs_partition* partition = vnode->stream.u.dev.partition;

		if (partition == NULL) {
			// pass the call through to the underlying device
			vnode->stream.u.dev.device->Removed();
		} else {
			// for partitions, we have to release the raw device but must
			// not free the device info as it was inherited from the raw
			// device and is still in use there
			put_vnode(fs->volume, partition->raw_device->id);

			// The partition descriptor itself, however, is owned by this
			// node only (it was allocated in add_partition()), so it has
			// to go with it, or it would be leaked.
			free(partition);
		}
	} else if (S_ISDIR(vnode->stream.type)) {
		mutex_destroy(&vnode->stream.u.dir.scan_lock);
	}

	free(vnode->name);
	free(vnode);

	return B_OK;
}


/*! Makes sure none of the dircookies point to the vnode passed in */
static void
update_dir_cookies(struct devfs_vnode* dir, struct devfs_vnode* vnode)
{
	struct devfs_dir_cookie* cookie = NULL;

	while ((cookie = (devfs_dir_cookie*)list_get_next_item(
			&dir->stream.u.dir.cookies, cookie)) != NULL) {
		if (cookie->current == vnode)
			cookie->current = vnode->dir_next;
	}
}


/*!	Looks up the entry \a path (a single path component) in the directory
	\a dir. "." resolves to \a dir itself, ".." to its parent.
	\return The matching vnode, or \c NULL if \a dir is not a directory or
		has no such entry.
*/
static struct devfs_vnode*
devfs_find_in_dir(struct devfs_vnode* dir, const char* path)
{
	if (!S_ISDIR(dir->stream.type))
		return NULL;

	if (strcmp(path, ".") == 0)
		return dir;
	if (strcmp(path, "..") == 0)
		return dir->parent;

	// walk the list of children until the name matches
	struct devfs_vnode* entry = dir->stream.u.dir.dir_head;
	while (entry != NULL && strcmp(entry->name, path) != 0)
		entry = entry->dir_next;

	return entry;
}


/*!	Links \a vnode into the entry list of the directory \a dir, keeping the
	list sorted by name, and updates the directory's modification time.
	\param notify If \c true, node monitoring notifications for the new
		entry and the changed directory are sent.
	\return \c B_OK, or \c B_BAD_VALUE if \a dir is not a directory.
*/
static status_t
devfs_insert_in_dir(struct devfs_vnode* dir, struct devfs_vnode* vnode,
	bool notify = true)
{
	if (!S_ISDIR(dir->stream.type))
		return B_BAD_VALUE;

	// Find the link that has to point to the new node: skip all entries
	// whose names sort before the new one.
	devfs_vnode** link = &dir->stream.u.dir.dir_head;
	while (*link != NULL && strcmp((*link)->name, vnode->name) < 0)
		link = &(*link)->dir_next;

	vnode->dir_next = *link;
	*link = vnode;

	vnode->parent = dir;
	dir->modification_time = current_timespec();

	if (!notify)
		return B_OK;

	notify_entry_created(sDeviceFileSystem->id, dir->id, vnode->name,
		vnode->id);
	notify_stat_changed(sDeviceFileSystem->id, get_parent_id(dir), dir->id,
		B_STAT_MODIFICATION_TIME);
	return B_OK;
}


/*!	Unlinks \a removeNode from the entry list of the directory \a dir.
	Open directory cookies pointing to the node are moved on to its
	successor, and the directory's modification time is updated.
	\param notify If \c true, node monitoring notifications for the removed
		entry and the changed directory are sent.
	\return \c B_OK, or \c B_ENTRY_NOT_FOUND if the node is not in \a dir.
*/
static status_t
devfs_remove_from_dir(struct devfs_vnode* dir, struct devfs_vnode* removeNode,
	bool notify = true)
{
	// find the link that points to the node
	struct devfs_vnode** link = &dir->stream.u.dir.dir_head;
	while (*link != NULL && *link != removeNode)
		link = &(*link)->dir_next;

	if (*link == NULL)
		return B_ENTRY_NOT_FOUND;

	// make sure no dircookies point to this vnode
	update_dir_cookies(dir, removeNode);

	*link = removeNode->dir_next;
	removeNode->dir_next = NULL;
	dir->modification_time = current_timespec();

	if (notify) {
		notify_entry_removed(sDeviceFileSystem->id, dir->id, removeNode->name,
			removeNode->id);
		notify_stat_changed(sDeviceFileSystem->id, get_parent_id(dir),
			dir->id, B_STAT_MODIFICATION_TIME);
	}

	return B_OK;
}


/*!	Publishes a partition of the raw device \a device under \a name in the
	same directory as the device. The partition node shares the device
	object of the raw device and holds a vnode reference to it.
	\return \c B_OK on success, \c B_BAD_VALUE if \a device is not a raw
		character device, \a info has a negative size, or an entry called
		\a name already exists, \c B_NO_MEMORY if out of memory, or the
		error returned by get_vnode().
*/
static status_t
add_partition(struct devfs* fs, struct devfs_vnode* device, const char* name,
	const partition_info& info)
{
	if (!S_ISCHR(device->stream.type))
		return B_BAD_VALUE;

	// we don't support nested partitions
	if (device->stream.u.dev.partition != NULL)
		return B_BAD_VALUE;

	// reduce checks to a minimum - things like negative offsets could be useful
	if (info.size < 0)
		return B_BAD_VALUE;

	// create partition
	struct devfs_partition* partition = (struct devfs_partition*)malloc(
		sizeof(struct devfs_partition));
	if (partition == NULL)
		return B_NO_MEMORY;

	memcpy(&partition->info, &info, sizeof(partition_info));

	RecursiveLocker locker(fs->lock);

	struct devfs_vnode* directory = device->parent;

	// you cannot change a partition once set
	if (devfs_find_in_dir(directory, name) != NULL) {
		free(partition);
		return B_BAD_VALUE;
	}

	// increase reference count of raw device -
	// the partition device really needs it
	status_t status = get_vnode(fs->volume, device->id,
		(void**)&partition->raw_device);
	if (status < B_OK) {
		free(partition);
		return status;
	}

	// now create the partition vnode
	struct devfs_vnode* partitionNode = devfs_create_vnode(fs, directory,
		name);
	if (partitionNode == NULL) {
		// give up the raw device reference again
		put_vnode(fs->volume, device->id);
		free(partition);
		return B_NO_MEMORY;
	}

	partitionNode->stream.type = device->stream.type;
	partitionNode->stream.u.dev.device = device->stream.u.dev.device;
	partitionNode->stream.u.dev.partition = partition;

	fs->vnode_hash->Insert(partitionNode);
	devfs_insert_in_dir(directory, partitionNode);

	TRACE(("add_partition(name = %s, offset = %" B_PRIdOFF
		", size = %" B_PRIdOFF ")\n",
		name, info.offset, info.size));
	return B_OK;
}


/*!	Translates a partition relative access into one relative to the raw
	device: \a size is clamped so that the access ends within the partition,
	and the partition's offset is added to \a offset.
	\a offset must lie within the partition (asserted).
*/
static inline void
translate_partition_access(devfs_partition* partition, off_t& offset,
	size_t& size)
{
	const partition_info& info = partition->info;

	ASSERT(offset >= 0);
	ASSERT(offset < info.size);

	const off_t remaining = info.size - offset;
	size = (size_t)min_c((off_t)size, remaining);
	offset += info.offset;
}


/*!	Unsigned 64 bit variant: translates a partition relative range into one
	relative to the raw device. \a size is clamped to the end of the
	partition, \a offset is shifted by the partition's offset.
	\return \c false (leaving both arguments untouched) if \a offset lies
		beyond the partition, or if shifting it would over- or underflow;
		\c true otherwise.
*/
static bool
translate_partition_access(devfs_partition* partition, uint64& offset,
	uint64& size)
{
	const off_t partitionSize = partition->info.size;
	const off_t partitionOffset = partition->info.offset;

	// Check that off_t values can be cast to uint64,
	// partition offset can theoretically be negative
	ASSERT(partitionSize >= 0);
	STATIC_ASSERT(sizeof(partitionSize) <= sizeof(uint64));
	STATIC_ASSERT(sizeof(partitionOffset) <= sizeof(uint64));

	// the access has to start within the partition
	if (offset >= (uint64)partitionSize)
		return false;

	if (partitionOffset >= 0) {
		// shifting upwards must not wrap around
		const uint64 shift = (uint64)partitionOffset;
		if (offset > UINT64_MAX - shift)
			return false;

		size = min_c(size, (uint64)partitionSize - offset);
		offset += shift;
	} else {
		// shifting downwards must not drop below zero
		const uint64 shift = (uint64)-partitionOffset;
		if (offset < shift)
			return false;

		size = min_c(size, (uint64)partitionSize - offset);
		offset -= shift;
	}

	return true;
}


/*!	Shifts the offset of the I/O \a request by the partition's offset, so
	that it becomes relative to the raw device. The request has to lie
	completely within the partition (asserted).
*/
static inline void
translate_partition_access(devfs_partition* partition, io_request* request)
{
	const off_t requestOffset = request->Offset();

	ASSERT(requestOffset >= 0);
	ASSERT(requestOffset + (off_t)request->Length() <= partition->info.size);

	request->SetOffset(requestOffset + partition->info.offset);
}


/*!	Resolves \a path on the volume of \a fs to its devfs node by delegating
	to vfs_get_fs_node_from_path(), and stores the result in \a _node.
	\return The status returned by vfs_get_fs_node_from_path().
*/
static status_t
get_node_for_path(struct devfs* fs, const char* path,
	struct devfs_vnode** _node)
{
	return vfs_get_fs_node_from_path(fs->volume, path, false, true,
		(void**)_node);
}


/*!	Removes \a node from its parent directory and asks the VFS to remove
	the vnode.
	\param type The file type (\c S_IFMT bits) the node is expected to have.
	\return \c B_BAD_TYPE if the node is of a different type, otherwise the
		status of devfs_remove_from_dir() or remove_vnode().
*/
static status_t
unpublish_node(struct devfs* fs, devfs_vnode* node, mode_t type)
{
	const mode_t nodeType = node->stream.type & S_IFMT;
	if (nodeType != type)
		return B_BAD_TYPE;

	RecursiveLocker locker(&fs->lock);

	status_t status = devfs_remove_from_dir(node->parent, node);
	if (status < B_OK)
		return status;

	return remove_vnode(fs->volume, node->id);
}


/*!	Makes \a node reachable: adds it to the node hash of \a fs and links it
	into the directory \a dirNode (sending the default notifications of
	devfs_insert_in_dir()). The result of the insertion is not checked.
*/
static void
publish_node(devfs* fs, devfs_vnode* dirNode, struct devfs_vnode* node)
{
	fs->vnode_hash->Insert(node);
	devfs_insert_in_dir(dirNode, node);
}


/*!	Creates all directories along \a path (relative to the devfs root) that
	do not exist yet; new directories get the mode 0755.
	The caller must hold the lock of \a fs (asserted).
	\return \c B_OK on success, \c B_FILE_EXISTS if a component of the path
		exists but is not a directory, \c B_NO_MEMORY if out of memory.
*/
static status_t
publish_directory(struct devfs* fs, const char* path)
{
	ASSERT_LOCKED_RECURSIVE(&fs->lock);

	// copy the path over to a temp buffer so we can munge it
	KPath tempPath(path);
	if (tempPath.InitCheck() != B_OK)
		return B_NO_MEMORY;

	TRACE(("devfs: publish directory \"%s\"\n", path));
	char* temp = tempPath.LockBuffer();

	// create the path leading to the device
	// parse the path passed in, stripping out '/'

	struct devfs_vnode* dir = fs->root_vnode;
	struct devfs_vnode* vnode = NULL;
	status_t status = B_OK;
	int32 i = 0, last = 0;
		// "last" is the start of the current component, "i" the scan
		// position

	while (temp[last]) {
		if (temp[i] == '/') {
			// terminate the component in place, and step over the slash
			temp[i] = '\0';
			i++;
		} else if (temp[i] != '\0') {
			// still within the component
			i++;
			continue;
		}

		//TRACE(("\tpath component '%s'\n", &temp[last]));

		// we have a path component
		vnode = devfs_find_in_dir(dir, &temp[last]);
		if (vnode) {
			if (S_ISDIR(vnode->stream.type)) {
				// the directory exists already, descend into it
				last = i;
				dir = vnode;
				continue;
			}

			// we hit something on our path that's not a directory
			status = B_FILE_EXISTS;
			goto out;
		} else {
			vnode = devfs_create_vnode(fs, dir, &temp[last]);
			if (!vnode) {
				status = B_NO_MEMORY;
				goto out;
			}
		}

		// set up the new directory
		init_directory_vnode(vnode, 0755);
		publish_node(sDeviceFileSystem, dir, vnode);

		last = i;
		dir = vnode;
	}

out:
	return status;
}


/*!	Creates a new node for the leaf of \a path (relative to the devfs root),
	creating and publishing missing directories (mode 0755) on the way.
	The leaf node itself is only created, but neither typed nor published;
	the caller is expected to initialize it, and publish it in the returned
	directory afterwards.
	The caller must hold the lock of \a fs (asserted).

	\param _node On success, set to the newly created leaf node.
	\param _dir On success, set to the directory the leaf belongs into.
	\return \c B_OK on success, \c B_FILE_EXISTS if the leaf exists already
		or a component on the way is not a directory, \c B_NO_MEMORY if out
		of memory. \a _node and \a _dir are not touched on error.
*/
static status_t
new_node(struct devfs* fs, const char* path, struct devfs_vnode** _node,
	struct devfs_vnode** _dir)
{
	ASSERT_LOCKED_RECURSIVE(&fs->lock);

	// copy the path over to a temp buffer so we can munge it
	KPath tempPath(path);
	if (tempPath.InitCheck() != B_OK)
		return B_NO_MEMORY;

	char* temp = tempPath.LockBuffer();

	// create the path leading to the device
	// parse the path passed in, stripping out '/'

	struct devfs_vnode* dir = fs->root_vnode;
	struct devfs_vnode* vnode = NULL;
	status_t status = B_OK;
	int32 i = 0, last = 0;
		// "last" is the start of the current component, "i" the scan
		// position
	bool atLeaf = false;

	for (;;) {
		if (temp[i] == '\0') {
			atLeaf = true; // we'll be done after this one
		} else if (temp[i] == '/') {
			// terminate the component in place, and step over the slash
			temp[i] = '\0';
			i++;
		} else {
			// still within the component
			i++;
			continue;
		}

		//TRACE(("\tpath component '%s'\n", &temp[last]));

		// we have a path component
		vnode = devfs_find_in_dir(dir, &temp[last]);
		if (vnode) {
			if (!atLeaf) {
				// we are not at the leaf of the path, so as long as
				// this is a dir we're okay
				if (S_ISDIR(vnode->stream.type)) {
					last = i;
					dir = vnode;
					continue;
				}
			}
			// we are at the leaf and hit another node
			// or we aren't but hit a non-dir node.
			// we're screwed
			status = B_FILE_EXISTS;
			goto out;
		} else {
			vnode = devfs_create_vnode(fs, dir, &temp[last]);
			if (!vnode) {
				status = B_NO_MEMORY;
				goto out;
			}
		}

		// set up the new vnode
		if (!atLeaf) {
			// this is a dir
			init_directory_vnode(vnode, 0755);
			publish_node(fs, dir, vnode);
		} else {
			// this is the last component
			// Note: We do not yet insert the node into the directory, as it
			// is not yet fully initialized. Instead we return the directory
			// vnode so that the calling function can insert it after all
			// initialization is done. This ensures that no create notification
			// is sent out for a vnode that is not yet fully valid.
			*_node = vnode;
			*_dir = dir;
			break;
		}

		last = i;
		dir = vnode;
	}

out:
	return status;
}


/*!	Publishes \a device as character device (mode 0644) at \a path, which
	has to be relative to the devfs root. Missing directories are created.
	The ID of the new node is passed on to the device via SetID().
	\return \c B_OK on success, \c B_ERROR (after a panic) if devfs is not
		mounted yet, \c B_BAD_VALUE if \a device or \a path are \c NULL, or
		\a path is empty or absolute, or any error returned by new_node().
*/
static status_t
publish_device(struct devfs* fs, const char* path, BaseDevice* device)
{
	TRACE(("publish_device(path = \"%s\", device = %p)\n", path, device));

	if (sDeviceFileSystem == NULL) {
		panic("publish_device() called before devfs mounted\n");
		return B_ERROR;
	}

	if (device == NULL || path == NULL)
		return B_BAD_VALUE;
	if (path[0] == '\0' || path[0] == '/')
		return B_BAD_VALUE;

// TODO: this has to be done in the BaseDevice sub classes!
#if 0
	// are the provided device hooks okay?
	if (info->device_open == NULL || info->device_close == NULL
		|| info->device_free == NULL
		|| ((info->device_read == NULL || info->device_write == NULL)
			&& info->device_io == NULL))
		return B_BAD_VALUE;
#endif

	RecursiveLocker locker(&fs->lock);

	struct devfs_vnode* deviceNode;
	struct devfs_vnode* directory;
	status_t status = new_node(fs, path, &deviceNode, &directory);
	if (status != B_OK)
		return status;

	// all went fine, let's initialize the node
	deviceNode->stream.type = S_IFCHR | 0644;
	deviceNode->stream.u.dev.device = device;
	device->SetID(deviceNode->id);

	// the node is now fully valid and we may insert it into the dir
	publish_node(fs, directory, deviceNode);
	return B_OK;
}


/*!	Construct complete device name (as used for device_open()).
	This is safe to use only when the device is in use (and therefore
	cannot be unpublished during the iteration).

	The name is the path of \a vnode relative to the devfs root, with the
	components separated by '/'. \a buffer always contains a null-terminated
	string afterwards (provided \a size is greater than 0): if \a vnode is
	the root, or if the complete name including its terminating null does
	not fit into \a size bytes, the buffer is set to the empty string
	instead of being left untouched or partially filled.
*/
static void
get_device_name(struct devfs_vnode* vnode, char* buffer, size_t size)
{
	if (buffer == NULL || size == 0)
		return;

	// Whatever happens below, the caller gets a terminated string.
	buffer[0] = '\0';

	RecursiveLocker _(sDeviceFileSystem->lock);

	struct devfs_vnode* leaf = vnode;
	size_t offset = 0;

	// count levels: every component needs room for its name plus either
	// the following separator or, for the leaf, the terminating null

	for (; vnode->parent && vnode->parent != vnode; vnode = vnode->parent) {
		offset += strlen(vnode->name) + 1;
	}

	// Don't write anything if the name doesn't fit as a whole; writing only
	// the leading components would leave the buffer unterminated.
	if (offset > size)
		return;

	// construct full path name, back to front

	for (vnode = leaf; vnode->parent && vnode->parent != vnode;
			vnode = vnode->parent) {
		size_t length = strlen(vnode->name);
		size_t start = offset - length - 1;

		strcpy(buffer + start, vnode->name);
		if (vnode != leaf) {
			// replace the terminating null just written by the separator
			// to the following component
			buffer[offset - 1] = '/';
		}

		offset = start;
	}
}


/*!	Kernel debugger command: prints the devfs node whose address is given
	as the only argument. For partition nodes, the debug variable "_raw" is
	set to the address of the raw device node.
*/
static int
dump_node(int argc, char** argv)
{
	if (argc != 2) {
		print_debugger_command_usage(argv[0]);
		return 0;
	}

	struct devfs_vnode* node = (struct devfs_vnode*)parse_expression(argv[1]);
	if (node == NULL) {
		kprintf("invalid node address\n");
		return 0;
	}

	// common part
	kprintf("DEVFS NODE: %p\n", node);
	kprintf(" id:          %" B_PRIdINO "\n", node->id);
	kprintf(" name:        \"%s\"\n", node->name);
	kprintf(" type:        %x\n", node->stream.type);
	kprintf(" parent:      %p\n", node->parent);
	kprintf(" dir next:    %p\n", node->dir_next);

	if (S_ISDIR(node->stream.type)) {
		// directory: list all entries
		kprintf(" dir scanned: %" B_PRId32 "\n", node->stream.u.dir.scanned);
		kprintf(" contents:\n");

		for (devfs_vnode* child = node->stream.u.dir.dir_head; child != NULL;
				child = child->dir_next) {
			kprintf("   %p, id %" B_PRIdINO "\n", child, child->id);
		}
		return 0;
	}

	if (S_ISLNK(node->stream.type)) {
		kprintf(" symlink to:  %s\n", node->stream.u.symlink.path);
		return 0;
	}

	// everything else is treated as a device
	struct devfs_partition* partition = node->stream.u.dev.partition;

	kprintf(" device:      %p\n", node->stream.u.dev.device);
	kprintf(" partition:   %p\n", partition);
	if (partition == NULL)
		return 0;

	partition_info& info = partition->info;
	kprintf("  raw device node: %p\n", partition->raw_device);
	kprintf("  offset:          %" B_PRIdOFF "\n", info.offset);
	kprintf("  size:            %" B_PRIdOFF "\n", info.size);
	kprintf("  block size:      %" B_PRId32 "\n", info.logical_block_size);
	kprintf("  session:         %" B_PRId32 "\n", info.session);
	kprintf("  partition:       %" B_PRId32 "\n", info.partition);
	kprintf("  device:          %s\n", info.device);
	set_debug_variable("_raw", (addr_t)partition->raw_device);

	return 0;
}


static int
dump_cookie(int argc, char** argv)
{
	if (argc != 2) {
		print_debugger_command_usage(argv[0]);
		return 0;
	}

	uint64 address;
	if (!evaluate_debug_expression(argv[1], &address, false))
		return 0;

	struct devfs_cookie* cookie = (devfs_cookie*)(addr_t)address;

	kprintf("DEVFS COOKIE: %p\n", cookie);
	kprintf(" device_cookie: %p\n", cookie->device_cookie);

	return 0;
}


//	#pragma mark - file system interface


/*!	Mount hook: creates the one and only devfs instance, sets up its node
	hash and root directory (mode 0755, its own parent), and publishes the
	root vnode.
	\param _rootNodeID Set to the ID of the root directory on success.
	\return \c B_OK on success, \c B_ERROR if devfs is already mounted,
		\c B_NO_MEMORY if any allocation (or the hash table initialization)
		failed.
*/
static status_t
devfs_mount(fs_volume* volume, const char* devfs, uint32 flags,
	const char* args, ino_t* _rootNodeID)
{
	struct devfs_vnode* vnode;
	struct devfs* fs;
	status_t err;

	TRACE(("devfs_mount: entry\n"));

	if (sDeviceFileSystem) {
		TRACE(("double mount of devfs attempted\n"));
		err = B_ERROR;
		goto err;
	}

	fs = (struct devfs*)malloc(sizeof(struct devfs));
	if (fs == NULL) {
		err = B_NO_MEMORY;
		goto err;
	}

	volume->private_volume = fs;
	volume->ops = &kVolumeOps;
	fs->volume = volume;
	fs->id = volume->id;
	fs->next_vnode_id = 0;

	recursive_lock_init(&fs->lock, "devfs lock");

	fs->vnode_hash = new(std::nothrow) NodeTable();
	if (fs->vnode_hash == NULL || fs->vnode_hash->Init(DEVFS_HASH_SIZE) != B_OK) {
		// The table object may exist even though its initialization failed;
		// the common cleanup path deletes it (deleting NULL is harmless).
		err = B_NO_MEMORY;
		goto err2;
	}

	// create a vnode
	vnode = devfs_create_vnode(fs, NULL, "");
	if (vnode == NULL) {
		err = B_NO_MEMORY;
		goto err2;
	}

	// set it up
	vnode->parent = vnode;

	// create a dir stream for it to hold
	init_directory_vnode(vnode, 0755);
	fs->root_vnode = vnode;

	fs->vnode_hash->Insert(vnode);
	publish_vnode(volume, vnode->id, vnode, &kVnodeOps, vnode->stream.type, 0);

	*_rootNodeID = vnode->id;
	sDeviceFileSystem = fs;
	return B_OK;

err2:
	delete fs->vnode_hash;
	recursive_lock_destroy(&fs->lock);
	free(fs);
err:
	return err;
}


/*!	Unmount hook: releases the root vnode reference, force-deletes all
	vnodes, and frees the devfs instance. The global instance pointer is
	reset as well, so that it cannot be used after the instance is gone
	(and devfs could be mounted again).
	\return Always \c B_OK.
*/
static status_t
devfs_unmount(fs_volume* _volume)
{
	struct devfs* fs = (struct devfs*)_volume->private_volume;
	struct devfs_vnode* vnode;

	TRACE(("devfs_unmount: entry fs = %p\n", fs));

	recursive_lock_lock(&fs->lock);

	// release the reference to the root
	put_vnode(fs->volume, fs->root_vnode->id);

	// delete all of the vnodes
	NodeTable::Iterator i(fs->vnode_hash);
	while (i.HasNext()) {
		vnode = i.Next();
		devfs_delete_vnode(fs, vnode, true);
	}
	delete fs->vnode_hash;

	// Don't leave a dangling pointer to the instance behind:
	// publish_device() and devfs_mount() check it to find out whether
	// devfs is mounted.
	if (sDeviceFileSystem == fs)
		sDeviceFileSystem = NULL;

	recursive_lock_destroy(&fs->lock);
	free(fs);

	return B_OK;
}


/*!	Sync hook: there is nothing to do here, always returns \c B_OK. */
static status_t
devfs_sync(fs_volume* _volume)
{
	TRACE(("devfs_sync: entry\n"));

	return B_OK;
}


/*!	Lookup hook: resolves \a name in the directory \a _dir, after making
	sure the directory has been scanned for drivers. On success, a vnode
	reference to the entry is acquired and its ID is stored in \a _id.
	\return \c B_OK on success, \c B_NOT_A_DIRECTORY if \a _dir is not a
		directory, \c B_ENTRY_NOT_FOUND if there is no such entry, or the
		error returned by get_vnode().
*/
static status_t
devfs_lookup(fs_volume* _volume, fs_vnode* _dir, const char* name, ino_t* _id)
{
	struct devfs* fs = (struct devfs*)_volume->private_volume;
	struct devfs_vnode* dir = (struct devfs_vnode*)_dir->private_node;

	TRACE(("devfs_lookup: entry dir %p, name '%s'\n", dir, name));

	if (!S_ISDIR(dir->stream.type))
		return B_NOT_A_DIRECTORY;

	// Make sure the directory contents are up to date
	scan_for_drivers_if_needed(dir);

	RecursiveLocker locker(&fs->lock);

	// look it up
	// We don't have to rescan if the entry is missing, because thanks to
	// node monitoring we already know it does not exist
	struct devfs_vnode* entry = devfs_find_in_dir(dir, name);
	if (entry == NULL)
		return B_ENTRY_NOT_FOUND;

	const status_t status = get_vnode(fs->volume, entry->id, NULL);
	if (status < B_OK)
		return status;

	*_id = entry->id;
	return B_OK;
}


/*!	Copies the name of the node into \a buffer using strlcpy(), i.e. the
	name is truncated if it does not fit into \a bufferSize bytes.
	\return Always \c B_OK.
*/
static status_t
devfs_get_vnode_name(fs_volume* _volume, fs_vnode* _vnode, char* buffer,
	size_t bufferSize)
{
	struct devfs_vnode* vnode = (struct devfs_vnode*)_vnode->private_node;

	TRACE(("devfs_get_vnode_name: vnode = %p\n", vnode));

	strlcpy(buffer, vnode->name, bufferSize);
	return B_OK;
}


/*!	Looks up the node with the given \a id in the node hash, and fills in
	\a _vnode, \a _type (the node's stream type), and \a _flags (always 0).
	\return \c B_OK on success, \c B_ENTRY_NOT_FOUND if there is no node
		with that ID.
*/
static status_t
devfs_get_vnode(fs_volume* _volume, ino_t id, fs_vnode* _vnode, int* _type,
	uint32* _flags, bool reenter)
{
	struct devfs* fs = (struct devfs*)_volume->private_volume;

	TRACE(("devfs_get_vnode: asking for vnode id = %" B_PRIdINO
		", vnode = %p, r %d\n", id, _vnode, reenter));

	RecursiveLocker locker(fs->lock);

	struct devfs_vnode* node = fs->vnode_hash->Lookup(id);
	if (node == NULL)
		return B_ENTRY_NOT_FOUND;

	TRACE(("devfs_get_vnode: looked it up at %p\n", node));

	*_type = node->stream.type;
	*_flags = 0;
	_vnode->ops = &kVnodeOps;
	_vnode->private_node = node;
	return B_OK;
}


/*!	Nothing is released here; apart from the optional trace output this
	just returns \c B_OK.
*/
static status_t
devfs_put_vnode(fs_volume* _volume, fs_vnode* _vnode, bool reenter)
{
#ifdef TRACE_DEVFS
	// the node is only needed for the trace output
	struct devfs_vnode* vnode = (struct devfs_vnode*)_vnode->private_node;

	TRACE(("devfs_put_vnode: entry on vnode %p, id = %" B_PRIdINO
		", reenter %d\n", vnode, vnode->id, reenter));
#endif

	return B_OK;
}


/*!	Deletes the node via devfs_delete_vnode() (without forcing it) while
	holding the file system lock. Panics if the node still has a successor
	in a directory list.
	\return Always \c B_OK; the result of devfs_delete_vnode() is ignored.
*/
static status_t
devfs_remove_vnode(fs_volume* _volume, fs_vnode* _v, bool reenter)
{
	struct devfs_vnode* node = (struct devfs_vnode*)_v->private_node;
	struct devfs* fs = (struct devfs*)_volume->private_volume;

	TRACE(("devfs_removevnode: remove %p (%" B_PRIdINO "), reenter %d\n",
		node, node->id, reenter));

	RecursiveLocker locker(&fs->lock);

	// can't remove node if it's linked to the dir
	if (node->dir_next != NULL)
		panic("devfs_removevnode: vnode %p asked to be removed is present in dir\n", node);

	devfs_delete_vnode(fs, node, false);

	return B_OK;
}


/*!	Opens the node, and returns a newly allocated devfs_cookie in
	\a _cookie. For character devices, the device is initialized and opened
	with its full device name; the cookie the device returns is stored in
	the devfs_cookie. For other nodes the device cookie stays \c NULL.
	\return \c B_OK on success, \c B_IS_A_DIRECTORY when trying to open a
		directory other than read-only, \c B_NOT_A_DIRECTORY if
		\c O_DIRECTORY was given for a non-directory, \c B_NO_MEMORY, or
		the error returned by the device's InitDevice() or Open().
*/
static status_t
devfs_open(fs_volume* _volume, fs_vnode* _vnode, int openMode,
	void** _cookie)
{
	struct devfs_vnode* vnode = (struct devfs_vnode*)_vnode->private_node;
	const bool isDirectory = S_ISDIR(vnode->stream.type);

	if (isDirectory && (openMode & O_RWMASK) != O_RDONLY)
		return B_IS_A_DIRECTORY;
	if (!isDirectory && (openMode & O_DIRECTORY) != 0)
		return B_NOT_A_DIRECTORY;

	struct devfs_cookie* cookie
		= (struct devfs_cookie*)malloc(sizeof(struct devfs_cookie));
	if (cookie == NULL)
		return B_NO_MEMORY;

	TRACE(("devfs_open: vnode %p, openMode 0x%x, cookie %p\n", vnode, openMode,
		cookie));

	cookie->device_cookie = NULL;

	if (!S_ISCHR(vnode->stream.type)) {
		// nothing else to do for anything but devices
		*_cookie = cookie;
		return B_OK;
	}

	BaseDevice* device = vnode->stream.u.dev.device;

	status_t status = device->InitDevice();
	if (status == B_OK) {
		char path[B_FILE_NAME_LENGTH];
		get_device_name(vnode, path, sizeof(path));

		status = device->Open(path, openMode, &cookie->device_cookie);
		if (status != B_OK) {
			// balance the successful InitDevice()
			device->UninitDevice();
		}
	}

	if (status != B_OK) {
		free(cookie);
		return status;
	}

	*_cookie = cookie;
	return B_OK;
}


/*!	Close hook: forwarded to the device's Close() for character devices;
	for all other nodes it simply returns \c B_OK.
*/
static status_t
devfs_close(fs_volume* _volume, fs_vnode* _vnode, void* _cookie)
{
	struct devfs_vnode* vnode = (struct devfs_vnode*)_vnode->private_node;
	struct devfs_cookie* cookie = (struct devfs_cookie*)_cookie;

	TRACE(("devfs_close: entry vnode %p, cookie %p\n", vnode, cookie));

	if (!S_ISCHR(vnode->stream.type))
		return B_OK;

	// pass the call through to the underlying device
	BaseDevice* device = vnode->stream.u.dev.device;
	return device->Close(cookie->device_cookie);
}


/*!	Frees the devfs_cookie. For character devices, the device cookie is
	freed via the device's Free() first, and the device is uninitialized
	(balancing the InitDevice() done in devfs_open()).
	\return Always \c B_OK.
*/
static status_t
devfs_free_cookie(fs_volume* _volume, fs_vnode* _vnode, void* _cookie)
{
	struct devfs_vnode* vnode = (struct devfs_vnode*)_vnode->private_node;
	struct devfs_cookie* cookie = (struct devfs_cookie*)_cookie;

	TRACE(("devfs_freecookie: entry vnode %p, cookie %p\n", vnode, cookie));

	const bool isDevice = S_ISCHR(vnode->stream.type);
	if (isDevice) {
		// pass the call through to the underlying device
		BaseDevice* device = vnode->stream.u.dev.device;
		device->Free(cookie->device_cookie);
		device->UninitDevice();
	}

	free(cookie);
	return B_OK;
}


/*!	Fsync hook: nothing is done here, always returns \c B_OK. */
static status_t
devfs_fsync(fs_volume* _volume, fs_vnode* _v, bool dataOnly)
{
	return B_OK;
}


/*!	Copies the target path of the symlink into \a buffer, at most
	\a *_bufferSize bytes; no terminating null is appended. On return,
	\a *_bufferSize contains the full length of the link's path.
	\return \c B_OK on success, \c B_BAD_VALUE if the node is not a symlink.
*/
static status_t
devfs_read_link(fs_volume* _volume, fs_vnode* _link, char* buffer,
	size_t* _bufferSize)
{
	struct devfs_vnode* link = (struct devfs_vnode*)_link->private_node;

	if (!S_ISLNK(link->stream.type))
		return B_BAD_VALUE;

	const size_t pathLength = link->stream.u.symlink.length;
	const size_t bytesToCopy = min_c(*_bufferSize, pathLength);

	memcpy(buffer, link->stream.u.symlink.path, bytesToCopy);

	// always report the complete length
	*_bufferSize = pathLength;

	return B_OK;
}


/*!	Reads from a character device. For partitions, the access is clamped to
	the partition and translated to the raw device first.
	\return \c B_BAD_VALUE if the node is not a character device, \a pos is
		negative, or lies at or beyond the end of the partition; \c B_OK if
		there is nothing to read; otherwise the result of the device's
		Read().
*/
static status_t
devfs_read(fs_volume* _volume, fs_vnode* _vnode, void* _cookie, off_t pos,
	void* buffer, size_t* _length)
{
	struct devfs_vnode* vnode = (struct devfs_vnode*)_vnode->private_node;
	struct devfs_cookie* cookie = (struct devfs_cookie*)_cookie;

	//TRACE(("devfs_read: vnode %p, cookie %p, pos %lld, len %p\n",
	//	vnode, cookie, pos, _length));

	if (!S_ISCHR(vnode->stream.type) || pos < 0)
		return B_BAD_VALUE;

	struct devfs_partition* partition = vnode->stream.u.dev.partition;
	if (partition != NULL) {
		if (pos >= partition->info.size)
			return B_BAD_VALUE;

		translate_partition_access(partition, pos, *_length);
	}

	if (*_length == 0)
		return B_OK;

	// pass the call through to the device
	BaseDevice* device = vnode->stream.u.dev.device;
	return device->Read(cookie->device_cookie, pos, buffer, _length);
}


/*!	Writes to a character device. For partitions, the access is clamped to
	the partition and translated to the raw device first.
	\return \c B_BAD_VALUE if the node is not a character device, \a pos is
		negative, or lies at or beyond the end of the partition; \c B_OK if
		there is nothing to write; otherwise the result of the device's
		Write().
*/
static status_t
devfs_write(fs_volume* _volume, fs_vnode* _vnode, void* _cookie, off_t pos,
	const void* buffer, size_t* _length)
{
	struct devfs_vnode* vnode = (struct devfs_vnode*)_vnode->private_node;
	struct devfs_cookie* cookie = (struct devfs_cookie*)_cookie;

	//TRACE(("devfs_write: vnode %p, cookie %p, pos %lld, len %p\n",
	//	vnode, cookie, pos, _length));

	if (!S_ISCHR(vnode->stream.type) || pos < 0)
		return B_BAD_VALUE;

	struct devfs_partition* partition = vnode->stream.u.dev.partition;
	if (partition != NULL) {
		if (pos >= partition->info.size)
			return B_BAD_VALUE;

		translate_partition_access(partition, pos, *_length);
	}

	if (*_length == 0)
		return B_OK;

	// pass the call through to the device
	BaseDevice* device = vnode->stream.u.dev.device;
	return device->Write(cookie->device_cookie, pos, buffer, _length);
}


/*!	Creates the directory \a name with the permissions \a perms in the
	directory \a _dir.
	The file system lock is held while the directory is searched and
	modified, like in all other functions that touch the directory lists
	and the node hash.
	\return \c B_OK on success, \c B_NOT_A_DIRECTORY if \a _dir is not a
		directory, \c EEXIST if an entry with that name exists already,
		\c B_NO_MEMORY if out of memory.
*/
static status_t
devfs_create_dir(fs_volume* _volume, fs_vnode* _dir, const char* name,
	int perms)
{
	struct devfs* fs = (struct devfs*)_volume->private_volume;
	struct devfs_vnode* dir = (struct devfs_vnode*)_dir->private_node;

	// Without this check, the new node would end up in the node hash
	// without being linked anywhere, since devfs_insert_in_dir() refuses
	// to insert into anything but a directory.
	if (!S_ISDIR(dir->stream.type))
		return B_NOT_A_DIRECTORY;

	// lookup and insertion must not be interleaved with other changes
	RecursiveLocker locker(&fs->lock);

	struct devfs_vnode* vnode = devfs_find_in_dir(dir, name);
	if (vnode != NULL) {
		return EEXIST;
	}

	vnode = devfs_create_vnode(fs, dir, name);
	if (vnode == NULL) {
		return B_NO_MEMORY;
	}

	// set up the new directory
	init_directory_vnode(vnode, perms);
	publish_node(fs, dir, vnode);

	return B_OK;
}


/*!	Opens the directory for iteration: makes sure it has been scanned for
	drivers, and returns a new cookie in \a _cookie that is positioned at
	the beginning and registered in the directory's cookie list.
	\return \c B_OK on success, \c B_BAD_VALUE if the node is not a
		directory, \c B_NO_MEMORY if the cookie could not be allocated.
*/
static status_t
devfs_open_dir(fs_volume* _volume, fs_vnode* _vnode, void** _cookie)
{
	struct devfs* fs = (struct devfs*)_volume->private_volume;
	struct devfs_vnode* directory = (struct devfs_vnode*)_vnode->private_node;

	TRACE(("devfs_open_dir: vnode %p\n", directory));

	if (!S_ISDIR(directory->stream.type))
		return B_BAD_VALUE;

	struct devfs_dir_cookie* dirCookie
		= (devfs_dir_cookie*)malloc(sizeof(devfs_dir_cookie));
	if (dirCookie == NULL)
		return B_NO_MEMORY;

	// make sure the directory has up-to-date contents
	scan_for_drivers_if_needed(directory);

	RecursiveLocker locker(&fs->lock);

	dirCookie->state = ITERATION_STATE_BEGIN;
	dirCookie->current = directory->stream.u.dir.dir_head;

	list_add_item(&directory->stream.u.dir.cookies, dirCookie);
	*_cookie = dirCookie;

	return B_OK;
}


/*!	Removes the directory cookie from the directory's cookie list (with the
	file system lock held), and frees it.
	\return Always \c B_OK.
*/
static status_t
devfs_free_dir_cookie(fs_volume* _volume, fs_vnode* _vnode, void* _cookie)
{
	struct devfs* fs = (struct devfs*)_volume->private_volume;
	struct devfs_vnode* directory = (struct devfs_vnode*)_vnode->private_node;
	struct devfs_dir_cookie* dirCookie = (devfs_dir_cookie*)_cookie;

	TRACE(("devfs_free_dir_cookie: entry vnode %p, cookie %p\n", directory, dirCookie));

	RecursiveLocker locker(&fs->lock);

	struct list* openCookies = &directory->stream.u.dir.cookies;
	list_remove_item(openCookies, dirCookie);

	free(dirCookie);
	return B_OK;
}


/*!	Reads the next entry of the directory \a _vnode into \a dirent.
	The entries "." and ".." are reported first, followed by the children of
	the directory in list order. At most one entry is returned per call.

	\param dirent The buffer the entry is written to.
	\param bufferSize The size of the \a dirent buffer in bytes.
	\param _num Set to 1 if an entry was returned, and to 0 once the end of
		the directory has been reached.
	\return \c B_OK on success (including end of directory), \c B_BAD_VALUE
		if the node is not a directory, \c ENOBUFS if the entry does not fit
		into the buffer (in which case neither the buffer nor the cookie are
		modified), or the error returned by user_strlcpy().
*/
static status_t
devfs_read_dir(fs_volume* _volume, fs_vnode* _vnode, void* _cookie,
	struct dirent* dirent, size_t bufferSize, uint32* _num)
{
	struct devfs_vnode* vnode = (devfs_vnode*)_vnode->private_node;
	struct devfs_dir_cookie* cookie = (devfs_dir_cookie*)_cookie;
	struct devfs* fs = (struct devfs*)_volume->private_volume;
	status_t status = B_OK;
	struct devfs_vnode* childNode = NULL;
	const char* name = NULL;
	struct devfs_vnode* nextChildNode = NULL;
	int32 nextState = cookie->state;

	TRACE(("devfs_read_dir: vnode %p, cookie %p, buffer %p, size %ld\n",
		_vnode, cookie, dirent, bufferSize));

	if (!S_ISDIR(vnode->stream.type))
		return B_BAD_VALUE;

	RecursiveLocker locker(&fs->lock);

	switch (cookie->state) {
		case ITERATION_STATE_DOT:
			childNode = vnode;
			name = ".";
			nextChildNode = vnode->stream.u.dir.dir_head;
			nextState = cookie->state + 1;
			break;
		case ITERATION_STATE_DOT_DOT:
			childNode = vnode->parent;
			name = "..";
			nextChildNode = vnode->stream.u.dir.dir_head;
			nextState = cookie->state + 1;
			break;
		default:
			childNode = cookie->current;
			if (childNode) {
				name = childNode->name;
				nextChildNode = childNode->dir_next;
			}
			break;
	}

	if (!childNode) {
		// end of directory
		*_num = 0;
		return B_OK;
	}

	// Validate the buffer size before touching the buffer at all, and do so
	// using the full-width length rather than the (narrower) d_reclen field.
	const size_t recordLength
		= offsetof(struct dirent, d_name) + strlen(name) + 1;
	if (recordLength > bufferSize)
		return ENOBUFS;

	dirent->d_dev = fs->id;
	dirent->d_ino = childNode->id;
	dirent->d_reclen = recordLength;

	status = user_strlcpy(dirent->d_name, name,
		bufferSize - offsetof(struct dirent, d_name));
	if (status < B_OK)
		return status;

	// only advance the iteration once the entry has been delivered
	cookie->current = nextChildNode;
	cookie->state = nextState;
	*_num = 1;

	return B_OK;
}


/*!	Resets the directory cookie \a _cookie so that the next read starts at
	the beginning of the directory \a _vnode again.
	Returns \c B_BAD_VALUE if the node is not a directory.
*/
static status_t
devfs_rewind_dir(fs_volume* _volume, fs_vnode* _vnode, void* _cookie)
{
	struct devfs* fs = (struct devfs*)_volume->private_volume;
	struct devfs_vnode* vnode = (struct devfs_vnode*)_vnode->private_node;
	struct devfs_dir_cookie* cookie = (devfs_dir_cookie*)_cookie;

	TRACE(("devfs_rewind_dir: vnode %p, cookie %p\n", vnode, cookie));

	if (S_ISDIR(vnode->stream.type)) {
		RecursiveLocker locker(&fs->lock);

		cookie->state = ITERATION_STATE_BEGIN;
		cookie->current = vnode->stream.u.dir.dir_head;
		return B_OK;
	}

	return B_BAD_VALUE;
}


/*!	Forwards the opcode to the device driver, but also handles some devfs
	specific functionality, like partitions.

	The following opcodes are (partially) handled by devfs itself:
	- \c B_GET_GEOMETRY: for partitions, the geometry reported by the device
	  is patched to match the partition size.
	- \c B_TRIM_DEVICE: for partitions, the ranges are translated to device
	  offsets before being passed on.
	- \c B_GET_PARTITION_INFO: answered from the partition data.
	- \c B_SET_PARTITION: always refused.
	- \c B_GET_PATH_FOR_DEVICE: answered from the node's name.
	- some legacy R5 opcodes, which are rejected as unsupported.

	Everything else is passed to the device's Control() hook unchanged.
	Returns \c B_BAD_VALUE if the node is not a device.
*/
static status_t
devfs_ioctl(fs_volume* _volume, fs_vnode* _vnode, void* _cookie, uint32 op,
	void* buffer, size_t length)
{
	struct devfs_vnode* vnode = (struct devfs_vnode*)_vnode->private_node;
	struct devfs_cookie* cookie = (struct devfs_cookie*)_cookie;

	TRACE(("devfs_ioctl: vnode %p, cookie %p, op %" B_PRIu32
		", buf %p, len %" B_PRIuSIZE "\n",
		vnode, cookie, op, buffer, length));

	// we are actually checking for a *device* here, we don't make the
	// distinction between char and block devices
	if (S_ISCHR(vnode->stream.type)) {
		switch (op) {
			case B_GET_GEOMETRY:
			{
				struct devfs_partition* partition
					= vnode->stream.u.dev.partition;
				if (partition == NULL)
					break;

				// The whole structure is copied to the caller below, so it
				// must not contain uninitialized stack contents in case the
				// driver fills in less than sizeof(device_geometry) bytes.
				device_geometry geometry;
				memset(&geometry, 0, sizeof(geometry));

				status_t status = vnode->stream.u.dev.device->Control(
					cookie->device_cookie, op, &geometry, length);
				if (status != B_OK)
					return status;

				// patch values to match partition size
				if (geometry.bytes_per_sector == 0)
					geometry.bytes_per_sector = 512;

				devfs_compute_geometry_size(&geometry,
					partition->info.size / geometry.bytes_per_sector,
					geometry.bytes_per_sector);

				return user_memcpy(buffer, &geometry, sizeof(device_geometry));
			}

			case B_TRIM_DEVICE:
			{
				struct devfs_partition* partition
					= vnode->stream.u.dev.partition;

				fs_trim_data* trimData;
				MemoryDeleter deleter;
				status_t status = get_trim_data_from_user(buffer, length,
					deleter, trimData);
				if (status != B_OK)
					return status;

#ifdef DEBUG_TRIM
				dprintf("TRIM: devfs: received TRIM ranges (bytes):\n");
				for (uint32 i = 0; i < trimData->range_count; i++) {
					dprintf("[%3" B_PRIu32 "] %" B_PRIu64 " : %"
						B_PRIu64 "\n", i,
						trimData->ranges[i].offset,
						trimData->ranges[i].size);
				}
#endif

				if (partition != NULL) {
					// If there is a partition, offset all ranges according
					// to the partition start.
					// Range size may be reduced to fit the partition size.
					for (uint32 i = 0; i < trimData->range_count; i++) {
						if (!translate_partition_access(partition,
							trimData->ranges[i].offset,
							trimData->ranges[i].size)) {
							return B_BAD_VALUE;
						}
					}

#ifdef DEBUG_TRIM
					dprintf("TRIM: devfs: TRIM ranges after partition"
						" translation (bytes):\n");
					for (uint32 i = 0; i < trimData->range_count; i++) {
						dprintf("[%3" B_PRIu32 "] %" B_PRIu64 " : %"
							B_PRIu64 "\n", i,
							trimData->ranges[i].offset,
							trimData->ranges[i].size);
					}
#endif
				}

				status = vnode->stream.u.dev.device->Control(
					cookie->device_cookie, op, trimData, length);

				// Copy the data back to userland (it contains the number of
				// trimmed bytes)
				if (status == B_OK)
					status = copy_trim_data_to_user(buffer, trimData);

				return status;
			}

			case B_GET_PARTITION_INFO:
			{
				// (the node is known to be a device at this point)
				struct devfs_partition* partition
					= vnode->stream.u.dev.partition;
				if (partition == NULL || length != sizeof(partition_info))
					return B_BAD_VALUE;

				return user_memcpy(buffer, &partition->info,
					sizeof(partition_info));
			}

			case B_SET_PARTITION:
				return B_NOT_ALLOWED;

			case B_GET_PATH_FOR_DEVICE:
			{
				char path[256];
				// TODO: we might want to actually find the mountpoint
				// of that instance of devfs...
				// but for now we assume it's mounted on /dev
				strcpy(path, "/dev/");
				get_device_name(vnode, path + 5, sizeof(path) - 5);
				// a length of 0 means the caller did not specify the size of
				// its buffer, in which case it is not checked
				if (length && (length <= strlen(path)))
					return ERANGE;
				return user_strlcpy((char*)buffer, path, sizeof(path));
			}

			// old unsupported R5 private stuff

			case B_GET_NEXT_OPEN_DEVICE:
				dprintf("devfs: unsupported legacy ioctl B_GET_NEXT_OPEN_DEVICE\n");
				return B_UNSUPPORTED;
			case B_ADD_FIXED_DRIVER:
				dprintf("devfs: unsupported legacy ioctl B_ADD_FIXED_DRIVER\n");
				return B_UNSUPPORTED;
			case B_REMOVE_FIXED_DRIVER:
				dprintf("devfs: unsupported legacy ioctl B_REMOVE_FIXED_DRIVER\n");
				return B_UNSUPPORTED;

		}

		// not handled (completely) above: let the driver deal with it
		return vnode->stream.u.dev.device->Control(cookie->device_cookie,
			op, buffer, length);
	}

	return B_BAD_VALUE;
}


/*!	Propagates the \c O_NONBLOCK state of \a flags to the underlying device
	by sending it the matching blocking/non-blocking control opcode.
	Returns \c B_NOT_ALLOWED if the node is not a device.
*/
static status_t
devfs_set_flags(fs_volume* _volume, fs_vnode* _vnode, void* _cookie,
	int flags)
{
	struct devfs_vnode* vnode = (struct devfs_vnode*)_vnode->private_node;
	struct devfs_cookie* cookie = (struct devfs_cookie*)_cookie;

	if (!S_ISCHR(vnode->stream.type))
		return B_NOT_ALLOWED;

	// O_NONBLOCK is the only flag the device needs to know about
	const uint32 op = (flags & O_NONBLOCK) != 0
		? B_SET_NONBLOCKING_IO : B_SET_BLOCKING_IO;

	return vnode->stream.u.dev.device->Control(cookie->device_cookie, op,
		NULL, 0);
}


/*!	Forwards a select() request for \a event to the device.
	If the device does not have a select hook, the event is notified right
	away (unless it is an output-only event), and \c B_UNSUPPORTED is
	returned. Returns \c B_NOT_ALLOWED if the node is not a device.
*/
static status_t
devfs_select(fs_volume* _volume, fs_vnode* _vnode, void* _cookie,
	uint8 event, selectsync* sync)
{
	struct devfs_vnode* vnode = (struct devfs_vnode*)_vnode->private_node;
	struct devfs_cookie* cookie = (struct devfs_cookie*)_cookie;

	if (!S_ISCHR(vnode->stream.type))
		return B_NOT_ALLOWED;

	BaseDevice* device = vnode->stream.u.dev.device;

	if (device->HasSelect())
		return device->Select(cookie->device_cookie, event, sync);

	// no select() hook: notify select() now
	if (!SELECT_TYPE_IS_OUTPUT_ONLY(event))
		notify_select_event(sync, event);

	return B_UNSUPPORTED;
}


/*!	Forwards a deselect() request for \a event to the device.
	Devices without a deselect hook are silently accepted. Returns
	\c B_NOT_ALLOWED if the node is not a device.
*/
static status_t
devfs_deselect(fs_volume* _volume, fs_vnode* _vnode, void* _cookie,
	uint8 event, selectsync* sync)
{
	struct devfs_vnode* vnode = (struct devfs_vnode*)_vnode->private_node;
	struct devfs_cookie* cookie = (struct devfs_cookie*)_cookie;

	if (!S_ISCHR(vnode->stream.type))
		return B_NOT_ALLOWED;

	BaseDevice* device = vnode->stream.u.dev.device;

	if (device->HasDeselect())
		return device->Deselect(cookie->device_cookie, event, sync);

	// nothing to do for devices without a deselect hook
	return B_OK;
}


/*!	Paging capability hook. Always reports \c false, regardless of the node
	or the device's capabilities.
*/
static bool
devfs_can_page(fs_volume* _volume, fs_vnode* _vnode, void* cookie)
{
	// TODO: Obsolete hook!
	return false;
}


/*!	Reads up to \a *_numBytes bytes starting at \a pos from the device into
	the \a count buffers described by \a vecs, using the device's Read()
	hook for one vector at a time.
	For partitions, the position is translated to a device offset first.

	\param _numBytes In: the number of bytes to read. Out: the number of
		bytes actually read.
	\return \c B_OK if at least one byte could be read, otherwise the error
		of the last Read() call (which is \c B_OK if nothing was to be read).
		\c B_NOT_ALLOWED if the node is not a readable device or no cookie
		was given, \c B_BAD_VALUE if the position is invalid.
*/
static status_t
devfs_read_pages(fs_volume* _volume, fs_vnode* _vnode, void* _cookie,
	off_t pos, const iovec* vecs, size_t count, size_t* _numBytes)
{
	struct devfs_vnode* vnode = (devfs_vnode*)_vnode->private_node;
	struct devfs_cookie* cookie = (struct devfs_cookie*)_cookie;

	//TRACE(("devfs_read_pages: vnode %p, vecs %p, count = %lu, pos = %lld, size = %lu\n", vnode, vecs, count, pos, *_numBytes));

	if (!S_ISCHR(vnode->stream.type))
		return B_NOT_ALLOWED;

	BaseDevice* device = vnode->stream.u.dev.device;
	if (!device->HasRead() && !device->HasIO())
		return B_NOT_ALLOWED;
	if (cookie == NULL)
		return B_NOT_ALLOWED;

	if (pos < 0)
		return B_BAD_VALUE;

	struct devfs_partition* partition = vnode->stream.u.dev.partition;
	if (partition != NULL) {
		if (pos >= partition->info.size)
			return B_BAD_VALUE;

		// adjusts both the position and the number of bytes
		translate_partition_access(partition, pos, *_numBytes);
	}

	// TODO: use io_requests for this, if the device has an I/O hook!

	// emulate read_pages() using read()

	status_t status = B_OK;
	size_t totalRead = 0;
	size_t bytesLeft = *_numBytes;

	for (size_t index = 0; index < count && bytesLeft > 0; index++) {
		const size_t chunkSize = min_c(vecs[index].iov_len, bytesLeft);
		size_t bytesRead = chunkSize;

		status = device->Read(cookie->device_cookie, pos,
			vecs[index].iov_base, &bytesRead);
		if (status != B_OK)
			break;

		pos += bytesRead;
		totalRead += bytesRead;
		bytesLeft -= bytesRead;

		// a short read ends the transfer
		if (bytesRead < chunkSize)
			break;
	}

	*_numBytes = totalRead;

	if (totalRead > 0)
		return B_OK;
	return status;
}


/*!	Writes up to \a *_numBytes bytes from the \a count buffers described by
	\a vecs to the device, starting at \a pos, using the device's Write()
	hook for one vector at a time.
	For partitions, the position is translated to a device offset first.

	\param _numBytes In: the number of bytes to write. Out: the number of
		bytes actually written.
	\return \c B_OK if at least one byte could be written, otherwise the
		error of the last Write() call (which is \c B_OK if nothing was to be
		written). \c B_NOT_ALLOWED if the node is not a writable device or no
		cookie was given, \c B_BAD_VALUE if the position is invalid.
*/
static status_t
devfs_write_pages(fs_volume* _volume, fs_vnode* _vnode, void* _cookie,
	off_t pos, const iovec* vecs, size_t count, size_t* _numBytes)
{
	struct devfs_vnode* vnode = (devfs_vnode*)_vnode->private_node;
	struct devfs_cookie* cookie = (struct devfs_cookie*)_cookie;

	//TRACE(("devfs_write_pages: vnode %p, vecs %p, count = %lu, pos = %lld, size = %lu\n", vnode, vecs, count, pos, *_numBytes));

	if (!S_ISCHR(vnode->stream.type))
		return B_NOT_ALLOWED;

	BaseDevice* device = vnode->stream.u.dev.device;
	if (!device->HasWrite() && !device->HasIO())
		return B_NOT_ALLOWED;
	if (cookie == NULL)
		return B_NOT_ALLOWED;

	if (pos < 0)
		return B_BAD_VALUE;

	struct devfs_partition* partition = vnode->stream.u.dev.partition;
	if (partition != NULL) {
		if (pos >= partition->info.size)
			return B_BAD_VALUE;

		// adjusts both the position and the number of bytes
		translate_partition_access(partition, pos, *_numBytes);
	}

	// TODO: use io_requests for this, if the device has an I/O hook!

	// emulate write_pages() using write()

	status_t status = B_OK;
	size_t totalWritten = 0;
	size_t bytesLeft = *_numBytes;

	for (size_t index = 0; index < count && bytesLeft > 0; index++) {
		const size_t chunkSize = min_c(vecs[index].iov_len, bytesLeft);
		size_t bytesWritten = chunkSize;

		status = device->Write(cookie->device_cookie, pos,
			vecs[index].iov_base, &bytesWritten);
		if (status != B_OK)
			break;

		pos += bytesWritten;
		totalWritten += bytesWritten;
		bytesLeft -= bytesWritten;

		// a short write ends the transfer
		if (bytesWritten < chunkSize)
			break;
	}

	*_numBytes = totalWritten;

	if (totalWritten > 0)
		return B_OK;
	return status;
}


/*!	Passes the I/O \a request on to the device's IO() hook.
	For partitions, the request must lie within the partition, and is
	translated to device offsets before being passed on.

	\return \c B_NOT_ALLOWED if the node is not a device or no cookie was
		given, \c B_BAD_VALUE if the request exceeds the partition (in both
		cases the request is notified of the failure as well),
		\c B_UNSUPPORTED if the device has no I/O hook (the request is left
		untouched in this case), otherwise the result of the IO() hook.
*/
static status_t
devfs_io(fs_volume* volume, fs_vnode* _vnode, void* _cookie,
	io_request* request)
{
	TRACE(("[%d] devfs_io(request: %p)\n", find_thread(NULL), request));

	devfs_vnode* vnode = (devfs_vnode*)_vnode->private_node;
	devfs_cookie* cookie = (devfs_cookie*)_cookie;

	if (!S_ISCHR(vnode->stream.type) || cookie == NULL) {
		request->SetStatusAndNotify(B_NOT_ALLOWED);
		return B_NOT_ALLOWED;
	}

	BaseDevice* device = vnode->stream.u.dev.device;
	if (!device->HasIO())
		return B_UNSUPPORTED;

	devfs_partition* partition = vnode->stream.u.dev.partition;
	if (partition != NULL) {
		const off_t requestEnd
			= request->Offset() + (off_t)request->Length();
		if (requestEnd > partition->info.size) {
			request->SetStatusAndNotify(B_BAD_VALUE);
			return B_BAD_VALUE;
		}

		translate_partition_access(partition, request);
	}

	return device->IO(cookie->device_cookie, request);
}


/*!	Fills in \a stat for the node \a _vnode.
	The size is only reported for partitions (the partition size) and
	symlinks (the length of the link); all other nodes report a size of 0.
	Devices with a non-zero size are reported as block devices.
*/
static status_t
devfs_read_stat(fs_volume* _volume, fs_vnode* _vnode, struct stat* stat)
{
	struct devfs_vnode* vnode = (struct devfs_vnode*)_vnode->private_node;

	TRACE(("devfs_read_stat: vnode %p (%" B_PRIdINO "), stat %p\n",
		vnode, vnode->id, stat));

	const mode_t type = vnode->stream.type;

	// identity and type
	stat->st_ino = vnode->id;
	stat->st_rdev = vnode->id;
	stat->st_mode = type;
	stat->st_nlink = 1;

	// ownership
	stat->st_uid = vnode->uid;
	stat->st_gid = vnode->gid;

	// sizes
	stat->st_size = 0;
	stat->st_blksize = 65536;
	stat->st_blocks = 0;

	// times; the access time is not tracked, the current time is reported
	stat->st_atim = current_timespec();
	stat->st_mtim = stat->st_ctim = vnode->modification_time;
	stat->st_crtim = vnode->creation_time;

	if (S_ISLNK(type)) {
		stat->st_size = vnode->stream.u.symlink.length;
		return B_OK;
	}

	if (!S_ISCHR(type))
		return B_OK;

	// TODO: this only works for partitions right now - if we should decide
	//	to keep this feature, we should have a better solution
	struct devfs_partition* partition = vnode->stream.u.dev.partition;
	if (partition != NULL) {
		// if it's a real block device, then let's report a useful size
		stat->st_size = partition->info.size;
	}

	// is this a real block device? then let's have it reported like that
	if (stat->st_size != 0)
		stat->st_mode = S_IFBLK | (type & S_IUMSK);

	return B_OK;
}


/*!	Applies the fields of \a stat selected by \a statMask to the node
	\a _vnode, and sends a stat-changed notification afterwards.
	Supported are the permission bits of the mode, the owner (UID/GID), as
	well as the modification and creation times.

	\return \c B_OK on success, \c B_BAD_VALUE if \a statMask contains
		\c B_STAT_SIZE, since the size of a devfs node cannot be changed.
*/
static status_t
devfs_write_stat(fs_volume* _volume, fs_vnode* _vnode, const struct stat* stat,
	uint32 statMask)
{
	struct devfs* fs = (struct devfs*)_volume->private_volume;
	struct devfs_vnode* vnode = (struct devfs_vnode*)_vnode->private_node;

	// The ID is printed in decimal (B_PRIdINO), hence no "0x" prefix.
	TRACE(("devfs_write_stat: vnode %p (%" B_PRIdINO "), stat %p\n",
		vnode, vnode->id, stat));

	// we cannot change the size of anything
	if (statMask & B_STAT_SIZE)
		return B_BAD_VALUE;

	RecursiveLocker locker(&fs->lock);

	if (statMask & B_STAT_MODE) {
		// only the permission bits may change, never the node type
		vnode->stream.type = (vnode->stream.type & ~S_IUMSK)
			| (stat->st_mode & S_IUMSK);
	}

	if (statMask & B_STAT_UID)
		vnode->uid = stat->st_uid;
	if (statMask & B_STAT_GID)
		vnode->gid = stat->st_gid;

	if (statMask & B_STAT_MODIFICATION_TIME)
		vnode->modification_time = stat->st_mtim;
	if (statMask & B_STAT_CREATION_TIME)
		vnode->creation_time = stat->st_crtim;

	notify_stat_changed(fs->id, get_parent_id(vnode), vnode->id, statMask);
	return B_OK;
}


/*!	Module init/uninit hook of the devfs module.
	On init, the "devfs_node" and "devfs_cookie" kernel debugger commands
	are registered and legacy_driver_init() is called; on uninit, the
	debugger commands are removed again. Any other \a op yields \c B_ERROR.
*/
static status_t
devfs_std_ops(int32 op, ...)
{
	if (op == B_MODULE_INIT) {
		add_debugger_command_etc("devfs_node", &dump_node,
			"Print info on a private devfs node",
			"<address>\n"
			"Prints information on a devfs node given by <address>.\n",
			0);
		add_debugger_command_etc("devfs_cookie", &dump_cookie,
			"Print info on a private devfs cookie",
			"<address>\n"
			"Prints information on a devfs cookie given by <address>.\n",
			0);

		legacy_driver_init();
		return B_OK;
	}

	if (op == B_MODULE_UNINIT) {
		remove_debugger_command("devfs_node", &dump_node);
		remove_debugger_command("devfs_cookie", &dump_cookie);
		return B_OK;
	}

	return B_ERROR;
}

namespace {

// Volume level hooks of devfs. The initializers are positional, so their
// order has to match the member order of fs_volume_ops; unsupported hooks
// are left NULL.
fs_volume_ops kVolumeOps = {
	&devfs_unmount,
	NULL,
		// NOTE(review): presumably the read/write fs info hooks - confirm
		// against the fs_volume_ops declaration
	NULL,
	&devfs_sync,
	&devfs_get_vnode,

	// the other operations are not supported (attributes, indices, queries)
	NULL,
};

// Node level hooks of devfs. The initializers are positional, so their
// order has to match the member order of fs_vnode_ops; unsupported hooks
// are left NULL.
fs_vnode_ops kVnodeOps = {
	&devfs_lookup,
	&devfs_get_vnode_name,

	&devfs_put_vnode,
	&devfs_remove_vnode,

	&devfs_can_page,
	&devfs_read_pages,
	&devfs_write_pages,

	&devfs_io,
	NULL,	// cancel_io()

	NULL,	// get_file_map

	/* common */
	&devfs_ioctl,
	&devfs_set_flags,
	&devfs_select,
	&devfs_deselect,
	&devfs_fsync,

	&devfs_read_link,
	NULL,	// symlink
	NULL,	// link
	NULL,	// unlink
	NULL,	// rename

	NULL,	// access
	&devfs_read_stat,
	&devfs_write_stat,
	NULL,
		// NOTE(review): unlabeled hook following write_stat - confirm which
		// one it is against the fs_vnode_ops declaration

	/* file */
	NULL,	// create
	&devfs_open,
	&devfs_close,
	&devfs_free_cookie,
	&devfs_read,
	&devfs_write,

	/* directory */
	&devfs_create_dir,
	NULL,	// remove_dir
	&devfs_open_dir,
	&devfs_close,
		// same as for files - it does nothing for directories, anyway
	&devfs_free_dir_cookie,
	&devfs_read_dir,
	&devfs_rewind_dir,

	// attributes operations are not supported
	NULL,
};

}	// namespace

// The module info of the devfs file system. devfs does not support any of
// the partition identification/scanning hooks; devfs_mount() is the only
// file system hook set here.
file_system_module_info gDeviceFileSystem = {
	{
		// generic module info: name, flags, and the init/uninit hook
		"file_systems/devfs" B_CURRENT_FS_API_VERSION,
		0,
		devfs_std_ops,
	},

	"devfs",					// short_name
	"Device File System",		// pretty_name
	0,							// DDM flags

	NULL,	// identify_partition()
	NULL,	// scan_partition()
	NULL,	// free_identify_partition_cookie()
	NULL,	// free_partition_content_cookie()

	&devfs_mount,
};


//	#pragma mark - kernel private API


/*!	Unpublishes the file device published at \a path.
	Returns \c B_BAD_VALUE if the node at \a path is not a device, or if its
	device is not a FileDevice.
*/
extern "C" status_t
devfs_unpublish_file_device(const char* path)
{
	// get the device node
	devfs_vnode* node;
	status_t status = get_node_for_path(sDeviceFileSystem, path, &node);
	if (status != B_OK)
		return status;

	// only unpublish the node, if it is indeed a file device (the device
	// member must not be looked at for anything but device nodes)
	if (S_ISCHR(node->stream.type)
		&& dynamic_cast<FileDevice*>(node->stream.u.dev.device) != NULL) {
		status = unpublish_node(sDeviceFileSystem, node, S_IFCHR);
	} else
		status = B_BAD_VALUE;

	// release the reference acquired by get_node_for_path()
	put_vnode(sDeviceFileSystem->volume, node->id);
	return status;
}


/*!	Creates a FileDevice for the file at \a filePath and publishes it at
	\a path. The device object is deleted again if it cannot be initialized
	or published.
*/
extern "C" status_t
devfs_publish_file_device(const char* path, const char* filePath)
{
	// create a FileDevice for the file
	FileDevice* fileDevice = new(std::nothrow) FileDevice;
	if (fileDevice == NULL)
		return B_NO_MEMORY;

	// deletes the device on every error path below
	ObjectDeleter<FileDevice> deleter(fileDevice);

	status_t status = fileDevice->Init(filePath);
	if (status == B_OK) {
		// publish the device
		status = publish_device(sDeviceFileSystem, path, fileDevice);
	}

	if (status != B_OK)
		return status;

	// the device is published now, and must be kept
	deleter.Detach();
	return B_OK;
}


/*!	Unpublishes the partition node at \a path. */
extern "C" status_t
devfs_unpublish_partition(const char* path)
{
	devfs_vnode* partitionNode;
	status_t result = get_node_for_path(sDeviceFileSystem, path,
		&partitionNode);
	if (result != B_OK)
		return result;

	result = unpublish_node(sDeviceFileSystem, partitionNode, S_IFCHR);

	// release the reference acquired by get_node_for_path()
	put_vnode(sDeviceFileSystem->volume, partitionNode->id);
	return result;
}


/*!	Publishes the partition described by \a info under the name \a name.
	The raw device the partition belongs to is looked up via
	\c info->device. Returns \c B_BAD_VALUE if either argument is \c NULL.
*/
extern "C" status_t
devfs_publish_partition(const char* name, const partition_info* info)
{
	if (name == NULL || info == NULL)
		return B_BAD_VALUE;
	TRACE(("publish partition: %s (device \"%s\", offset %" B_PRIdOFF
		", size %" B_PRIdOFF ")\n",
		name, info->device, info->offset, info->size));

	// get the node of the raw device
	devfs_vnode* rawDevice;
	status_t result = get_node_for_path(sDeviceFileSystem, info->device,
		&rawDevice);
	if (result != B_OK)
		return result;

	result = add_partition(sDeviceFileSystem, rawDevice, name, *info);

	// release the reference acquired by get_node_for_path()
	put_vnode(sDeviceFileSystem->volume, rawDevice->id);
	return result;
}


/*!	Renames the entry \a oldName to \a newName in the directory that contains
	the device at \a devicePath, and sends the respective node monitoring
	notifications.

	\return \c B_OK on success, \c B_BAD_VALUE if one of the names is
		\c NULL or if \a newName already exists, \c B_ENTRY_NOT_FOUND if
		there is no entry \a oldName, \c B_NO_MEMORY if the new name could
		not be allocated, or the error of get_node_for_path().
*/
extern "C" status_t
devfs_rename_partition(const char* devicePath, const char* oldName,
	const char* newName)
{
	if (oldName == NULL || newName == NULL)
		return B_BAD_VALUE;

	devfs_vnode* device;
	status_t status = get_node_for_path(sDeviceFileSystem, devicePath, &device);
	if (status != B_OK)
		return status;

	{
		RecursiveLocker locker(sDeviceFileSystem->lock);

		devfs_vnode* directory = device->parent;
		devfs_vnode* node = devfs_find_in_dir(directory, oldName);
		char* name = NULL;

		if (node == NULL)
			status = B_ENTRY_NOT_FOUND;
		else if (devfs_find_in_dir(directory, newName) != NULL) {
			// the new path already exists
			status = B_BAD_VALUE;
		} else if ((name = strdup(newName)) == NULL)
			status = B_NO_MEMORY;

		if (status == B_OK) {
			// the node has to be reinserted under its new name
			devfs_remove_from_dir(directory, node, false);

			free(node->name);
			node->name = name;

			devfs_insert_in_dir(directory, node, false);

			notify_entry_moved(sDeviceFileSystem->id, directory->id, oldName,
				directory->id, newName, node->id);
			notify_stat_changed(sDeviceFileSystem->id,
				get_parent_id(directory), directory->id,
				B_STAT_MODIFICATION_TIME);
		}
	}

	// Release the reference acquired by get_node_for_path() on all paths -
	// it was leaked before. This is done with the lock already released.
	put_vnode(sDeviceFileSystem->volume, device->id);
	return status;
}


/*!	Publishes the directory \a path with the devfs lock held. */
extern "C" status_t
devfs_publish_directory(const char* path)
{
	RecursiveLocker locker(&sDeviceFileSystem->lock);

	status_t status = publish_directory(sDeviceFileSystem, path);
	return status;
}


/*!	Unpublishes the device published at \a path.
	If \a disconnect is \c true, and the node could be unpublished, the vnode
	is disconnected via vfs_disconnect_vnode() as well.
*/
extern "C" status_t
devfs_unpublish_device(const char* path, bool disconnect)
{
	devfs_vnode* deviceNode;
	status_t result = get_node_for_path(sDeviceFileSystem, path, &deviceNode);
	if (result != B_OK)
		return result;

	result = unpublish_node(sDeviceFileSystem, deviceNode, S_IFCHR);
	if (disconnect && result == B_OK)
		vfs_disconnect_vnode(sDeviceFileSystem->id, deviceNode->id);

	// release the reference acquired by get_node_for_path()
	put_vnode(sDeviceFileSystem->volume, deviceNode->id);
	return result;
}


//	#pragma mark - device_manager private API


/*!	Publishes \a device at the devfs relative \a path. */
status_t
devfs_publish_device(const char* path, BaseDevice* device)
{
	status_t status = publish_device(sDeviceFileSystem, path, device);
	return status;
}


/*!	Unpublishes the node of \a device, which is looked up by the device's ID.
	If \a disconnect is \c true, and the node could be unpublished, the vnode
	is disconnected via vfs_disconnect_vnode() as well.
*/
status_t
devfs_unpublish_device(BaseDevice* device, bool disconnect)
{
	devfs_vnode* deviceNode;
	status_t result = get_vnode(sDeviceFileSystem->volume, device->ID(),
		(void**)&deviceNode);
	if (result != B_OK)
		return result;

	result = unpublish_node(sDeviceFileSystem, deviceNode, S_IFCHR);
	if (disconnect && result == B_OK)
		vfs_disconnect_vnode(sDeviceFileSystem->id, deviceNode->id);

	// release the reference acquired by get_vnode()
	put_vnode(sDeviceFileSystem->volume, deviceNode->id);
	return result;
}


/*!	Gets the device for a given devfs relative path.
	If successful the call must be balanced with a call to devfs_put_device().
	Nodes that are not devices, as well as partitions, are rejected with
	\c B_BAD_VALUE.
*/
status_t
devfs_get_device(const char* path, BaseDevice*& _device)
{
	devfs_vnode* node;
	status_t status = get_node_for_path(sDeviceFileSystem, path, &node);
	if (status != B_OK)
		return status;

	const bool isRawDevice = S_ISCHR(node->stream.type)
		&& node->stream.u.dev.partition == NULL;
	if (isRawDevice) {
		// the node reference is kept until devfs_put_device() is called
		_device = node->stream.u.dev.device;
		return B_OK;
	}

	put_vnode(sDeviceFileSystem->volume, node->id);
	return B_BAD_VALUE;
}


/*!	Releases the vnode reference of \a device, identified by the device's
	ID.
*/
void
devfs_put_device(BaseDevice* device)
{
	fs_volume* volume = sDeviceFileSystem->volume;
	put_vnode(volume, device->ID());
}


/*!	Fills in \a geometry such that it describes \a blockCount blocks of
	\a blockSize bytes each, using a single cylinder.
	As long as the block count exceeds \c UINT32_MAX, it is halved while the
	head count is doubled.
*/
void
devfs_compute_geometry_size(device_geometry* geometry, uint64 blockCount,
	uint32 blockSize)
{
	// move the excess of the block count into the head count
	geometry->head_count = 1;
	for (; blockCount > UINT32_MAX; blockCount >>= 1)
		geometry->head_count <<= 1;

	geometry->bytes_per_sector = blockSize;
	geometry->sectors_per_track = blockCount;
	geometry->cylinder_count = 1;
}


//	#pragma mark - support API for legacy drivers


/*!	Triggers a rescan of the legacy driver \a driverName. */
extern "C" status_t
devfs_rescan_driver(const char* driverName)
{
	TRACE(("devfs_rescan_driver: %s\n", driverName));

	status_t status = legacy_driver_rescan(driverName);
	return status;
}


/*!	Publishes a legacy driver device at \a path using the given \a hooks. */
extern "C" status_t
devfs_publish_device(const char* path, device_hooks* hooks)
{
	status_t status = legacy_driver_publish(path, hooks);
	return status;
}