lguest: update commentry

Every so often, after code shuffles, I need to go through and unbitrot
the Lguest Journey (see drivers/lguest/README).  Since we now use RCU in
a simple form in one place I took the opportunity to expand that explanation.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index aa66a52..4516365 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -49,7 +49,7 @@
 #include "linux/virtio_ring.h"
 #include "asm/bootparam.h"
 /*L:110
- * We can ignore the 39 include files we need for this program, but I do want
+ * We can ignore the 42 include files we need for this program, but I do want
  * to draw attention to the use of kernel-style types.
  *
  * As Linus said, "C is a Spartan language, and so should your naming be."  I
@@ -305,6 +305,11 @@
 		    PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0);
 	if (addr == MAP_FAILED)
 		err(1, "Mmaping %u pages of /dev/zero", num);
+
+	/*
+	 * One neat mmap feature is that you can close the fd, and it
+	 * stays mapped.
+	 */
 	close(fd);
 
 	return addr;
@@ -557,7 +562,7 @@
 }
 /*:*/
 
-/*
+/*L:200
  * Device Handling.
  *
  * When the Guest gives us a buffer, it sends an array of addresses and sizes.
@@ -608,7 +613,10 @@
 	return next;
 }
 
-/* This actually sends the interrupt for this virtqueue */
+/*
+ * This actually sends the interrupt for this virtqueue, if we've used a
+ * buffer.
+ */
 static void trigger_irq(struct virtqueue *vq)
 {
 	unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
@@ -629,12 +637,12 @@
 }
 
 /*
- * This looks in the virtqueue and for the first available buffer, and converts
+ * This looks in the virtqueue for the first available buffer, and converts
  * it to an iovec for convenient access.  Since descriptors consist of some
  * number of output then some number of input descriptors, it's actually two
  * iovecs, but we pack them into one and note how many of each there were.
  *
- * This function returns the descriptor number found.
+ * This function waits if necessary, and returns the descriptor number found.
  */
 static unsigned wait_for_vq_desc(struct virtqueue *vq,
 				 struct iovec iov[],
@@ -644,10 +652,14 @@
 	struct vring_desc *desc;
 	u16 last_avail = lg_last_avail(vq);
 
+	/* There's nothing available? */
 	while (last_avail == vq->vring.avail->idx) {
 		u64 event;
 
-		/* OK, tell Guest about progress up to now. */
+		/*
+		 * Since we're about to sleep, now is a good time to tell the
+		 * Guest about what we've used up to now.
+		 */
 		trigger_irq(vq);
 
 		/* OK, now we need to know about added descriptors. */
@@ -734,8 +746,9 @@
 }
 
 /*
- * After we've used one of their buffers, we tell them about it.  We'll then
- * want to send them an interrupt, using trigger_irq().
+ * After we've used one of their buffers, we tell the Guest about it.  Sometime
+ * later we'll want to send them an interrupt using trigger_irq(); note that
+ * wait_for_vq_desc() does that for us if it has to wait.
  */
 static void add_used(struct virtqueue *vq, unsigned int head, int len)
 {
@@ -782,12 +795,12 @@
 	struct console_abort *abort = vq->dev->priv;
 	struct iovec iov[vq->vring.num];
 
-	/* Make sure there's a descriptor waiting. */
+	/* Make sure there's a descriptor available. */
 	head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
 	if (out_num)
 		errx(1, "Output buffers in console in queue?");
 
-	/* Read it in. */
+	/* Read into it.  This is where we usually wait. */
 	len = readv(STDIN_FILENO, iov, in_num);
 	if (len <= 0) {
 		/* Ran out of input? */
@@ -800,6 +813,7 @@
 			pause();
 	}
 
+	/* Tell the Guest we used a buffer. */
 	add_used_and_trigger(vq, head, len);
 
 	/*
@@ -834,15 +848,23 @@
 	unsigned int head, out, in;
 	struct iovec iov[vq->vring.num];
 
+	/* We usually wait in here, for the Guest to give us something. */
 	head = wait_for_vq_desc(vq, iov, &out, &in);
 	if (in)
 		errx(1, "Input buffers in console output queue?");
+
+	/* writev can return a partial write, so we loop here. */
 	while (!iov_empty(iov, out)) {
 		int len = writev(STDOUT_FILENO, iov, out);
 		if (len <= 0)
 			err(1, "Write to stdout gave %i", len);
 		iov_consume(iov, out, len);
 	}
+
+	/*
+	 * We're finished with that buffer: if we're going to sleep,
+	 * wait_for_vq_desc() will prod the Guest with an interrupt.
+	 */
 	add_used(vq, head, 0);
 }
 
@@ -862,15 +884,30 @@
 	unsigned int head, out, in;
 	struct iovec iov[vq->vring.num];
 
+	/* We usually wait in here for the Guest to give us a packet. */
 	head = wait_for_vq_desc(vq, iov, &out, &in);
 	if (in)
 		errx(1, "Input buffers in net output queue?");
+	/*
+	 * Send the whole thing through to /dev/net/tun.  It expects the exact
+	 * same format: what a coincidence!
+	 */
 	if (writev(net_info->tunfd, iov, out) < 0)
 		errx(1, "Write to tun failed?");
+
+	/*
+	 * Done with that one; wait_for_vq_desc() will send the interrupt if
+	 * all packets are processed.
+	 */
 	add_used(vq, head, 0);
 }
 
-/* Will reading from this file descriptor block? */
+/*
+ * Handling network input is a bit trickier, because I've tried to optimize it.
+ *
+ * First we have a helper routine which tells is if from this file descriptor
+ * (ie. the /dev/net/tun device) will block:
+ */
 static bool will_block(int fd)
 {
 	fd_set fdset;
@@ -880,7 +917,11 @@
 	return select(fd+1, &fdset, NULL, NULL, &zero) != 1;
 }
 
-/* This handles packets coming in from the tun device to our Guest. */
+/*
+ * This handles packets coming in from the tun device to our Guest.  Like all
+ * service routines, it gets called again as soon as it returns, so you don't
+ * see a while(1) loop here.
+ */
 static void net_input(struct virtqueue *vq)
 {
 	int len;
@@ -888,21 +929,38 @@
 	struct iovec iov[vq->vring.num];
 	struct net_info *net_info = vq->dev->priv;
 
+	/*
+	 * Get a descriptor to write an incoming packet into.  This will also
+	 * send an interrupt if they're out of descriptors.
+	 */
 	head = wait_for_vq_desc(vq, iov, &out, &in);
 	if (out)
 		errx(1, "Output buffers in net input queue?");
 
-	/* Deliver interrupt now, since we're about to sleep. */
+	/*
+	 * If it looks like we'll block reading from the tun device, send them
+	 * an interrupt.
+	 */
 	if (vq->pending_used && will_block(net_info->tunfd))
 		trigger_irq(vq);
 
+	/*
+	 * Read in the packet.  This is where we normally wait (when there's no
+	 * incoming network traffic).
+	 */
 	len = readv(net_info->tunfd, iov, in);
 	if (len <= 0)
 		err(1, "Failed to read from tun.");
+
+	/*
+	 * Mark that packet buffer as used, but don't interrupt here.  We want
+	 * to wait until we've done as much work as we can.
+	 */
 	add_used(vq, head, len);
 }
+/*:*/
 
-/* This is the helper to create threads. */
+/* This is the helper to create threads: run the service routine in a loop. */
 static int do_thread(void *_vq)
 {
 	struct virtqueue *vq = _vq;
@@ -950,11 +1008,14 @@
 	signal(SIGCHLD, (void *)kill_launcher);
 }
 
+/*L:216
+ * This actually creates the thread which services the virtqueue for a device.
+ */
 static void create_thread(struct virtqueue *vq)
 {
 	/*
-	 * Create stack for thread and run it.  Since the stack grows upwards,
-	 * we point the stack pointer to the end of this region.
+	 * Create stack for thread.  Since the stack grows upwards, we point
+	 * the stack pointer to the end of this region.
 	 */
 	char *stack = malloc(32768);
 	unsigned long args[] = { LHREQ_EVENTFD,
@@ -966,17 +1027,22 @@
 		err(1, "Creating eventfd");
 	args[2] = vq->eventfd;
 
-	/* Attach an eventfd to this virtqueue: it will go off
-	 * when the Guest does an LHCALL_NOTIFY for this vq. */
+	/*
+	 * Attach an eventfd to this virtqueue: it will go off when the Guest
+	 * does an LHCALL_NOTIFY for this vq.
+	 */
 	if (write(lguest_fd, &args, sizeof(args)) != 0)
 		err(1, "Attaching eventfd");
 
-	/* CLONE_VM: because it has to access the Guest memory, and
-	 * SIGCHLD so we get a signal if it dies. */
+	/*
+	 * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so
+	 * we get a signal if it dies.
+	 */
 	vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq);
 	if (vq->thread == (pid_t)-1)
 		err(1, "Creating clone");
-	/* We close our local copy, now the child has it. */
+
+	/* We close our local copy now the child has it. */
 	close(vq->eventfd);
 }
 
@@ -1028,7 +1094,10 @@
 	}
 }
 
-/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */
+/*L:215
+ * This is the generic routine we call when the Guest uses LHCALL_NOTIFY.  In
+ * particular, it's used to notify us of device status changes during boot.
+ */
 static void handle_output(unsigned long addr)
 {
 	struct device *i;
@@ -1037,18 +1106,32 @@
 	for (i = devices.dev; i; i = i->next) {
 		struct virtqueue *vq;
 
-		/* Notifications to device descriptors update device status. */
+		/*
+		 * Notifications to device descriptors mean they updated the
+		 * device status.
+		 */
 		if (from_guest_phys(addr) == i->desc) {
 			update_device_status(i);
 			return;
 		}
 
-		/* Devices *can* be used before status is set to DRIVER_OK. */
+		/*
+		 * Devices *can* be used before status is set to DRIVER_OK.
+		 * The original plan was that they would never do this: they
+		 * would always finish setting up their status bits before
+		 * actually touching the virtqueues.  In practice, we allowed
+		 * them to, and they do (eg. the disk probes for partition
+		 * tables as part of initialization).
+		 *
+		 * If we see this, we start the device: once it's running, we
+		 * expect the device to catch all the notifications.
+		 */
 		for (vq = i->vq; vq; vq = vq->next) {
 			if (addr != vq->config.pfn*getpagesize())
 				continue;
 			if (i->running)
 				errx(1, "Notification on running %s", i->name);
+			/* This just calls create_thread() for each virtqueue */
 			start_device(i);
 			return;
 		}
@@ -1132,6 +1215,11 @@
 	vq->next = NULL;
 	vq->last_avail_idx = 0;
 	vq->dev = dev;
+
+	/*
+	 * This is the routine the service thread will run, and its Process ID
+	 * once it's running.
+	 */
 	vq->service = service;
 	vq->thread = (pid_t)-1;
 
@@ -1202,7 +1290,8 @@
 
 /*
  * This routine does all the creation and setup of a new device, including
- * calling new_dev_desc() to allocate the descriptor and device memory.
+ * calling new_dev_desc() to allocate the descriptor and device memory.  We
+ * don't actually start the service threads until later.
  *
  * See what I mean about userspace being boring?
  */
@@ -1478,19 +1567,7 @@
 		verbose("device %u: tun %s: %s\n",
 			devices.device_num, tapif, arg);
 }
-
-/*
- * Our block (disk) device should be really simple: the Guest asks for a block
- * number and we read or write that position in the file.  Unfortunately, that
- * was amazingly slow: the Guest waits until the read is finished before
- * running anything else, even if it could have been doing useful work.
- *
- * We could use async I/O, except it's reputed to suck so hard that characters
- * actually go missing from your code when you try to use it.
- *
- * So this was one reason why lguest now does all virtqueue servicing in
- * separate threads: it's more efficient and more like a real device.
- */
+/*:*/
 
 /* This hangs off device->priv. */
 struct vblk_info
@@ -1512,8 +1589,16 @@
 /*L:210
  * The Disk
  *
- * Remember that the block device is handled by a separate I/O thread.  We head
- * straight into the core of that thread here:
+ * The disk only has one virtqueue, so it only has one thread.  It is really
+ * simple: the Guest asks for a block number and we read or write that position
+ * in the file.
+ *
+ * Before we serviced each virtqueue in a separate thread, that was unacceptably
+ * slow: the Guest waits until the read is finished before running anything
+ * else, even if it could have been doing useful work.
+ *
+ * We could have used async I/O, except it's reputed to suck so hard that
+ * characters actually go missing from your code when you try to use it.
  */
 static void blk_request(struct virtqueue *vq)
 {
@@ -1525,7 +1610,10 @@
 	struct iovec iov[vq->vring.num];
 	off64_t off;
 
-	/* Get the next request. */
+	/*
+	 * Get the next request, where we normally wait.  It triggers the
+	 * interrupt to acknowledge previously serviced requests (if any).
+	 */
 	head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
 
 	/*
@@ -1539,6 +1627,10 @@
 
 	out = convert(&iov[0], struct virtio_blk_outhdr);
 	in = convert(&iov[out_num+in_num-1], u8);
+	/*
+	 * For historical reasons, block operations are expressed in 512 byte
+	 * "sectors".
+	 */
 	off = out->sector * 512;
 
 	/*
@@ -1614,6 +1706,7 @@
 	if (out->type & VIRTIO_BLK_T_BARRIER)
 		fdatasync(vblk->fd);
 
+	/* Finished that request. */
 	add_used(vq, head, wlen);
 }
 
@@ -1682,9 +1775,8 @@
 		errx(1, "Output buffers in rng?");
 
 	/*
-	 * This is why we convert to iovecs: the readv() call uses them, and so
-	 * it reads straight into the Guest's buffer.  We loop to make sure we
-	 * fill it.
+	 * Just like the console write, we loop to cover the whole iovec.
+	 * In this case, short reads actually happen quite a bit.
 	 */
 	while (!iov_empty(iov, in_num)) {
 		len = readv(rng_info->rfd, iov, in_num);
@@ -1818,7 +1910,9 @@
 	devices.lastdev = NULL;
 	devices.next_irq = 1;
 
+	/* We're CPU 0.  In fact, that's the only CPU possible right now. */
 	cpu_id = 0;
+
 	/*
 	 * We need to know how much memory so we can set up the device
 	 * descriptor and memory pages for the devices as we parse the command
@@ -1926,7 +2020,7 @@
 	 */
 	tell_kernel(start);
 
-	/* Ensure that we terminate if a child dies. */
+	/* Ensure that we terminate if a device-servicing child dies. */
 	signal(SIGCHLD, kill_launcher);
 
 	/* If we exit via err(), this kills all the threads, restores tty. */