[trace-engine] Fix bug writing arguments within inline names.

Change-Id: I5cbaa3cec12c0423546e3e6db76d8bea9257cfea
[virtio][entropy] Basic virtio-rng driver
2017-09-12 17:36:11 +00:00 · 2017-09-12 17:14:47 +00:00 · 2017-09-12 17:09:50 +00:00 · 2017-09-12 08:22:07 +00:00 · 2017-09-11 23:29:35 -07:00 · 2017-09-11 23:29:35 -07:00
@@ -8,10 +8,6 @@ env:
    - PROJECT=magenta-hikey960-arm64
    - PROJECT=magenta-odroidc2-arm64

-    # kernel only, no-magenta builds
-    - PROJECT=pc-x86-64-test
-    - PROJECT=qemu-virt-a53-test
-
 sudo: required
 dist: trusty

@@ -20,11 +20,10 @@ changes, and locking (via futexes).
 Currently there are some temporary syscalls that have been used for early
 bringup work, which will be going away in the future as the long term
 syscall API/ABI surface is finalized.  The expectation is that there will
-be 10s, not 100s of syscalls.
+be about 100 syscalls.

-Magenta syscalls are generally non-blocking.  The wait (one, many, set)
-family of syscalls, ioport reads, and thread sleep being the notable
-exceptions.
+Magenta syscalls are generally non-blocking, with wait_one, wait_many,
+port_wait, and thread sleep being the notable exceptions.

 This page is a non-comprehensive index of the magenta documentation.

@@ -33,9 +32,10 @@ This page is a non-comprehensive index of the magenta documentation.
 + [Testing](docs/testing.md)
 + [Hacking notes](docs/hacking.md)
 + [Memory usage analysis tools](docs/memory.md)
-+ [Relationship with LK](docs/mg_and_lk.md)
++ [Relationship with LK](docs/mx_and_lk.md)
 + [Kernel Objects](docs/objects.md)
 + [Process Objects](docs/objects/process.md)
 + [Thread Objects](docs/objects/thread.md)
 + [Handles](docs/handles.md)
 + [System Calls](docs/syscalls.md)
++ [Micro-benchmarks](docs/benchmarks/microbenchmarks.md)
@@ -46,7 +46,6 @@ EFI_SOURCES := \
    $(LOCAL_DIR)/src/osboot.c \
    $(LOCAL_DIR)/src/cmdline.c \
    $(LOCAL_DIR)/src/magenta.c \
-    $(LOCAL_DIR)/src/legacy.c \
    $(LOCAL_DIR)/src/misc.c \
    $(LOCAL_DIR)/src/netboot.c \
    $(LOCAL_DIR)/src/netifc.c \
@@ -65,6 +64,7 @@ EFI_SOURCES += \
    $(LOCAL_DIR)/lib/printf.c \
    $(LOCAL_DIR)/lib/stdlib.c \
    $(LOCAL_DIR)/lib/string.c \
+    $(LOCAL_DIR)/lib/strings.c \
    $(LOCAL_DIR)/lib/inet.c \

 EFI_OBJS := $(patsubst $(LOCAL_DIR)/%.c,$(BUILDDIR)/bootloader/%.o,$(EFI_SOURCES))
@@ -1,118 +0,0 @@
-// Copyright 2016 The Fuchsia Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#pragma once
-
-#ifndef __ASSEMBLY__
-
-#if __GNUC__ || defined(__clang__)
-#define likely(x)       __builtin_expect(!!(x), 1)
-#define unlikely(x)     __builtin_expect(!!(x), 0)
-#define __UNUSED __attribute__((__unused__))
-#define __PACKED __attribute__((packed))
-#define __ALIGNED(x) __attribute__((aligned(x)))
-#define __PRINTFLIKE(__fmt,__varargs) __attribute__((__format__ (__printf__, __fmt, __varargs)))
-#define __SCANFLIKE(__fmt,__varargs) __attribute__((__format__ (__scanf__, __fmt, __varargs)))
-#define __SECTION(x) __attribute((section(x)))
-#define __PURE __attribute((pure))
-#define __CONST __attribute((const))
-#define __NO_RETURN __attribute__((noreturn))
-#define __MALLOC __attribute__((malloc))
-#define __WEAK __attribute__((weak))
-#define __GNU_INLINE __attribute__((gnu_inline))
-#define __GET_CALLER(x) __builtin_return_address(0)
-#define __GET_FRAME(x) __builtin_frame_address(0)
-#define __NAKED __attribute__((naked))
-#define __ISCONSTANT(x) __builtin_constant_p(x)
-#define __NO_INLINE __attribute((noinline))
-#define __SRAM __NO_INLINE __SECTION(".sram.text")
-#define __CONSTRUCTOR __attribute__((constructor))
-#define __DESTRUCTOR __attribute__((destructor))
-
-#ifndef __clang__
-#define __OPTIMIZE(x) __attribute__((optimize(x)))
-#define __EXTERNALLY_VISIBLE __attribute__((externally_visible))
-#else
-#define __OPTIMIZE(x)
-#define __EXTERNALLY_VISIBLE
-#endif
-
-#define __ALWAYS_INLINE __attribute__((always_inline))
-#define __MAY_ALIAS __attribute__((may_alias))
-#define __NONNULL(x) __attribute((nonnull x))
-#define __WARN_UNUSED_RESULT __attribute((warn_unused_result))
-#define __UNREACHABLE __builtin_unreachable()
-#define __WEAK_ALIAS(x) __attribute__((weak, alias(x)))
-#define __ALIAS(x) __attribute__((alias(x)))
-#define __EXPORT __attribute__ ((visibility("default")))
-#define __LOCAL  __attribute__ ((visibility("hidden")))
-#define __THREAD __thread
-#define __offsetof(type, field) __builtin_offsetof(type, field)
-
-#if !defined __DEPRECATED
-#define __DEPRECATED __attribute((deprecated))
-#endif
-
-/* compiler fence */
-#define CF do { __asm__ volatile("" ::: "memory"); } while(0)
-
-#else  // if __GNUC__ || defined(__clang__)
-
-#warning "Unrecognized compiler!  Please update global/include/compiler.h"
-
-#define likely(x)
-#define unlikely(x)
-#define __UNUSED
-#define __PACKED
-#define __ALIGNED(x)
-#define __PRINTFLIKE(__fmt,__varargs)
-#define __SCANFLIKE(__fmt,__varargs)
-#define __SECTION(x)
-#define __PURE
-#define __CONST
-#define __NO_RETURN
-#define __MALLOC
-#define __WEAK
-#define __GNU_INLINE
-#define __GET_CALLER(x)
-#define __GET_FRAME(x)
-#define __NAKED
-#define __ISCONSTANT(x)
-#define __NO_INLINE
-#define __SRAM
-#define __CONSTRUCTOR
-#define __DESTRUCTOR
-#define __OPTIMIZE(x)
-#define __ALWAYS_INLINE
-#define __MAY_ALIAS
-#define __NONNULL(x)
-#define __WARN_UNUSED_RESULT
-#define __EXTERNALLY_VISIBLE
-#define __UNREACHABLE
-#define __WEAK_ALIAS(x)
-#define __ALIAS(x)
-#define __EXPORT
-#define __LOCAL
-#define __THREAD
-
-#if !defined __DEPRECATED
-#define __DEPRECATED
-#endif
-
-#define CF
-
-#endif  // if __GNUC__ || defined(__clang__)
-#endif  // ifndef __ASSEMBLY__
-
-/* TODO: add type check */
-#define countof(a) (sizeof(a) / sizeof((a)[0]))
-
-/* CPP header guards */
-#ifdef __cplusplus
-#define __BEGIN_CDECLS  extern "C" {
-#define __END_CDECLS    }
-#else
-#define __BEGIN_CDECLS
-#define __END_CDECLS
-#endif
@@ -6,3 +6,4 @@

 int isdigit(int c);
 int isspace(int c);
+int tolower(int c);
@@ -6,6 +6,18 @@

 #include <stdint.h>

+#define PRId8 "d"
+#define PRId16 "d"
+#define PRId32 "d"
+
+#define PRIu8 "u"
+#define PRIu16 "u"
+#define PRIu32 "u"
+
+#define PRIx8 "x"
+#define PRIx16 "x"
+#define PRIx32 "x"
+
 #ifdef __clang__
 #define PRIx64 "llx"
 #else
@@ -9,9 +9,10 @@
 #define __LIB_PRINTF_H

 #include <stdarg.h>
-#include <compiler.h>
 #include <stdint.h>

+#include <magenta/compiler.h>
+
 __BEGIN_CDECLS

 #if !DISABLE_DEBUG_OUTPUT
@@ -0,0 +1,10 @@
+// Copyright 2017 The Fuchsia Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#pragma once
+
+#include <stdint.h>
+
+int strcasecmp(const char* s1, const char* s2);
+int strncasecmp(const char* s1, const char* s2, size_t len);
@@ -16,3 +16,11 @@ int isspace(int c) {
           (c == '\t') ||
           (c == '\v');
 }
+
+int tolower(int c) {
+    if (c >= 'A' && c <= 'Z') {
+        return c + ('a' - 'A');
+    }
+    return c;
+}
+
@@ -0,0 +1,29 @@
+// Copyright 2017 The Fuchsia Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <ctype.h>
+#include <strings.h>
+
+int strcasecmp(const char* s1, const char* s2) {
+    while (1) {
+        int diff = tolower(*s1) - tolower(*s2);
+        if (diff != 0 || *s1 == '\0') {
+            return diff;
+        }
+        s1++;
+        s2++;
+    }
+}
+
+int strncasecmp(const char* s1, const char* s2, size_t len) {
+    while (len-- > 0) {
+        int diff = tolower(*s1) - tolower(*s2);
+        if (diff != 0 || *s1 == '\0') {
+            return diff;
+        }
+        s1++;
+        s2++;
+    }
+    return 0;
+}
@@ -183,8 +183,8 @@ void draw_logo() {
             logo_width, logo_height, 0);
 }

-#include "font-1x.h"
-#include "font-2x.h"
+#include <magenta/font/font-9x16.h>
+#include <magenta/font/font-18x32.h>

 static void putchar(efi_graphics_output_protocol* gop, fb_font* font, unsigned ch, unsigned x, unsigned y, unsigned scale_x, unsigned scale_y, efi_graphics_output_blt_pixel* fg, efi_graphics_output_blt_pixel* bg) {
    const uint16_t* cdata = font->data + ch * font->height;
@@ -225,9 +225,9 @@ void draw_nodename(const char* nodename) {
        return;

    fb_font font = {
-        .data = FONT2X,
-        .width = FONT2X_WIDTH,
-        .height = FONT2X_HEIGHT,
+        .data = FONT18X32,
+        .width = FONT18X32_WIDTH,
+        .height = FONT18X32_HEIGHT,
        .color = &font_white,
    };

@@ -247,9 +247,9 @@ void draw_version(const char* version) {
    size_t version_len = strlen(version);

    fb_font font = {
-        .data = FONT1X,
-        .width = FONT1X_WIDTH,
-        .height = FONT1X_HEIGHT,
+        .data = FONT9X16,
+        .width = FONT9X16_WIDTH,
+        .height = FONT9X16_HEIGHT,
        .color = &font_fuchsia,
    };

@@ -1,340 +0,0 @@
-// Copyright 2017 The Fuchsia Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <string.h>
-#include <xefi.h>
-
-#include "osboot.h"
-
-#include <magenta/boot/bootdata.h>
-
-typedef struct {
-    uint8_t* zeropage;
-    uint8_t* cmdline;
-    void* image;
-    uint32_t pages;
-} kernel_t;
-
-#define E820_IGNORE 0
-#define E820_RAM 1
-#define E820_RESERVED 2
-#define E820_ACPI 3
-#define E820_NVS 4
-#define E820_UNUSABLE 5
-
-static inline const char* e820name(int e820) {
-    switch (e820) {
-    case E820_IGNORE:   return "IGNORE";
-    case E820_RAM:      return "RAM";
-    case E820_RESERVED: return "RESERVED";
-    case E820_ACPI:     return "ACPI";
-    case E820_NVS:      return "NVS";
-    case E820_UNUSABLE: return "UNUSABLE";
-    }
-    return "";
-}
-
-struct e820entry {
-    uint64_t addr;
-    uint64_t size;
-    uint32_t type;
-} __attribute__((packed));
-
-static unsigned e820type(unsigned uefi_mem_type) {
-    switch (uefi_mem_type) {
-    case EfiReservedMemoryType:
-    case EfiPalCode:
-        return E820_RESERVED;
-    case EfiRuntimeServicesCode:
-    case EfiRuntimeServicesData:
-#if WITH_RUNTIME_SERVICES
-        return E820_RESERVED;
-#else
-        return E820_RAM;
-#endif
-    case EfiACPIReclaimMemory:
-        return E820_ACPI;
-    case EfiACPIMemoryNVS:
-        return E820_NVS;
-    case EfiLoaderCode:
-    case EfiLoaderData:
-    case EfiBootServicesCode:
-    case EfiBootServicesData:
-    case EfiConventionalMemory:
-        return E820_RAM;
-    case EfiMemoryMappedIO:
-    case EfiMemoryMappedIOPortSpace:
-        return E820_IGNORE;
-    default:
-        if (uefi_mem_type >= 0x80000000) {
-            return E820_RAM;
-        }
-        return E820_UNUSABLE;
-    }
-}
-
-static unsigned char scratch[32768];
-static struct e820entry e820table[128];
-
-static int process_memory_map(efi_system_table* sys, size_t* _key, int silent) {
-    efi_memory_descriptor* mmap;
-    struct e820entry* entry = e820table;
-    size_t msize, off;
-    size_t mkey, dsize;
-    uint32_t dversion;
-    unsigned n, type;
-    efi_status r;
-
-    msize = sizeof(scratch);
-    mmap = (efi_memory_descriptor*)scratch;
-    mkey = dsize = dversion = 0;
-    r = sys->BootServices->GetMemoryMap(&msize, mmap, &mkey, &dsize, &dversion);
-    if (!silent)
-        printf("r=%zx msz=%zx key=%zx dsz=%zx dvn=%x\n", r, msize, mkey, dsize, dversion);
-    if (r != EFI_SUCCESS) {
-        return -1;
-    }
-    if (msize > sizeof(scratch)) {
-        if (!silent)
-            printf("Memory Table Too Large (%zu entries)\n", (msize / dsize));
-        return -1;
-    }
-    for (off = 0, n = 0; off < msize; off += dsize) {
-        mmap = (efi_memory_descriptor*)(scratch + off);
-        type = e820type(mmap->Type);
-        if (type == E820_IGNORE) {
-            continue;
-        }
-        if ((n > 0) && (entry[n - 1].type == type)) {
-            if ((entry[n - 1].addr + entry[n - 1].size) == mmap->PhysicalStart) {
-                entry[n - 1].size += mmap->NumberOfPages * PAGE_SIZE;
-                continue;
-            }
-        }
-        entry[n].addr = mmap->PhysicalStart;
-        entry[n].size = mmap->NumberOfPages * PAGE_SIZE;
-        entry[n].type = type;
-        n++;
-        if (n == 128) {
-            if (!silent)
-                printf("E820 Table Too Large (%zu raw entries)\n", (msize / dsize));
-            return -1;
-        }
-    }
-    *_key = mkey;
-    return n;
-}
-
-#define ZP_E820_COUNT 0x1E8   // byte
-#define ZP_SETUP 0x1F1        // start of setup structure
-#define ZP_SETUP_SECTS 0x1F1  // byte (setup_size/512-1)
-#define ZP_JUMP 0x200         // jump instruction
-#define ZP_HEADER 0x202       // word "HdrS"
-#define ZP_VERSION 0x206      // half 0xHHLL
-#define ZP_LOADER_TYPE 0x210  // byte
-#define ZP_RAMDISK_BASE 0x218 // word (ptr or 0)
-#define ZP_RAMDISK_SIZE 0x21C // word (bytes)
-#define ZP_EXTRA_MAGIC 0x220  // word
-#define ZP_CMDLINE 0x228      // word (ptr)
-#define ZP_SYSSIZE 0x1F4      // word (size/16)
-#define ZP_XLOADFLAGS 0x236   // half
-#define ZP_E820_TABLE 0x2D0   // 128 entries
-
-#define ZP_ACPI_RSD 0x080 // word phys ptr
-#define ZP_FB_BASE 0x090
-#define ZP_FB_WIDTH 0x094
-#define ZP_FB_HEIGHT 0x098
-#define ZP_FB_STRIDE 0x09C
-#define ZP_FB_FORMAT 0x0A0
-#define ZP_FB_REGBASE 0x0A4
-#define ZP_FB_SIZE 0x0A8
-
-#define ZP_MAGIC_VALUE 0xDBC64323
-
-#define ZP8(p, off) (*((uint8_t*)((p) + (off))))
-#define ZP16(p, off) (*((uint16_t*)((p) + (off))))
-#define ZP32(p, off) (*((uint32_t*)((p) + (off))))
-
-static void install_memmap(kernel_t* k, struct e820entry* memmap, unsigned count) {
-    memcpy(k->zeropage + ZP_E820_TABLE, memmap, sizeof(*memmap) * count);
-    ZP8(k->zeropage, ZP_E820_COUNT) = count;
-}
-
-static void start_deprecated(kernel_t* k) {
-    // 64bit entry is at offset 0x200
-    uint64_t entry = (uint64_t)(k->image + 0x200);
-
-    // ebx = 0, ebp = 0, edi = 0, esi = zeropage
-    __asm__ __volatile__(
-        "movl $0, %%ebp \n"
-        "cli \n"
-        "jmp *%[entry] \n" ::[entry] "a"(entry),
-        [zeropage] "S"(k->zeropage),
-        "b"(0), "D"(0));
-    for (;;)
-        ;
-}
-
-static int load_deprecated(efi_boot_services* bs, uint8_t* image, size_t sz, kernel_t* k) {
-    uint32_t setup_sz;
-    uint32_t image_sz;
-    uint32_t setup_end;
-    efi_physical_addr mem;
-
-    k->zeropage = NULL;
-    k->cmdline = NULL;
-    k->image = NULL;
-    k->pages = 0;
-
-    if (sz < 1024) {
-        // way too small to be a kernel
-        goto fail;
-    }
-
-    if (ZP32(image, ZP_HEADER) != 0x53726448) {
-        printf("kernel: invalid setup magic %08x\n", ZP32(image, ZP_HEADER));
-        goto fail;
-    }
-    if (ZP16(image, ZP_VERSION) < 0x020B) {
-        printf("kernel: unsupported setup version %04x\n", ZP16(image, ZP_VERSION));
-        goto fail;
-    }
-    setup_sz = (ZP8(image, ZP_SETUP_SECTS) + 1) * 512;
-    image_sz = (ZP32(image, ZP_SYSSIZE) * 16);
-    setup_end = ZP_JUMP + ZP8(image, ZP_JUMP + 1);
-
-    printf("setup %d image %d  hdr %04x-%04x\n", setup_sz, image_sz, ZP_SETUP, setup_end);
-    // image size may be rounded up, thus +15
-    if ((setup_sz < 1024) || ((setup_sz + image_sz) > (sz + 15))) {
-        printf("kernel: invalid image size\n");
-        goto fail;
-    }
-
-    mem = 0xFF000;
-    if (bs->AllocatePages(AllocateMaxAddress, EfiLoaderData, 1, &mem)) {
-        printf("kernel: cannot allocate 'zero page'\n");
-        goto fail;
-    }
-    k->zeropage = (void*)mem;
-
-    mem = 0xFF000;
-    if (bs->AllocatePages(AllocateMaxAddress, EfiLoaderData, 1, &mem)) {
-        printf("kernel: cannot allocate commandline\n");
-        goto fail;
-    }
-    k->cmdline = (void*)mem;
-
-    mem = 0x100000;
-    k->pages = BYTES_TO_PAGES(image_sz);
-    if (bs->AllocatePages(AllocateAddress, EfiLoaderData, k->pages + 1, &mem)) {
-        printf("kernel: cannot allocate kernel\n");
-        goto fail;
-    }
-    k->image = (void*)mem;
-
-    // setup zero page, copy setup header from kernel binary
-    memset(k->zeropage, 0, PAGE_SIZE);
-    memcpy(k->zeropage + ZP_SETUP, image + ZP_SETUP, setup_end - ZP_SETUP);
-
-    memcpy(k->image, image + setup_sz, image_sz);
-
-    // empty commandline for now
-    ZP32(k->zeropage, ZP_CMDLINE) = (uint64_t)k->cmdline;
-    k->cmdline[0] = 0;
-
-    // default to no ramdisk
-    ZP32(k->zeropage, ZP_RAMDISK_BASE) = 0;
-    ZP32(k->zeropage, ZP_RAMDISK_SIZE) = 0;
-
-    // undefined bootloader
-    ZP8(k->zeropage, ZP_LOADER_TYPE) = 0xFF;
-
-    printf("kernel @%p, zeropage @%p, cmdline @%p\n",
-           k->image, k->zeropage, k->cmdline);
-
-    return 0;
-fail:
-    if (k->image) {
-        bs->FreePages((efi_physical_addr)k->image, k->pages);
-    }
-    if (k->cmdline) {
-        bs->FreePages((efi_physical_addr)k->cmdline, 1);
-    }
-    if (k->zeropage) {
-        bs->FreePages((efi_physical_addr)k->zeropage, 1);
-    }
-
-    return -1;
-}
-
-int boot_deprecated(efi_handle img, efi_system_table* sys,
-                    void* image, size_t sz,
-                    void* ramdisk, size_t rsz,
-                    void* cmdline, size_t csz) {
-    efi_boot_services* bs = sys->BootServices;
-    kernel_t kernel;
-    efi_status r;
-    size_t key;
-    int n, i;
-
-    efi_graphics_output_protocol* gop;
-    bs->LocateProtocol(&GraphicsOutputProtocol, NULL, (void**)&gop);
-
-    printf("boot_kernel() from %p (%zu bytes)\n", image, sz);
-    if (ramdisk && rsz) {
-        printf("ramdisk at %p (%zu bytes)\n", ramdisk, rsz);
-    }
-
-    if (load_deprecated(sys->BootServices, image, sz, &kernel)) {
-        printf("Failed to load kernel image\n");
-        return -1;
-    }
-
-    ZP32(kernel.zeropage, ZP_EXTRA_MAGIC) = ZP_MAGIC_VALUE;
-    ZP32(kernel.zeropage, ZP_ACPI_RSD) = find_acpi_root(img, sys);
-
-    ZP32(kernel.zeropage, ZP_FB_BASE) = (uint32_t)gop->Mode->FrameBufferBase;
-    ZP32(kernel.zeropage, ZP_FB_WIDTH) = (uint32_t)gop->Mode->Info->HorizontalResolution;
-    ZP32(kernel.zeropage, ZP_FB_HEIGHT) = (uint32_t)gop->Mode->Info->VerticalResolution;
-    ZP32(kernel.zeropage, ZP_FB_STRIDE) = (uint32_t)gop->Mode->Info->PixelsPerScanLine;
-    ZP32(kernel.zeropage, ZP_FB_FORMAT) = get_mx_pixel_format(gop);
-    ZP32(kernel.zeropage, ZP_FB_REGBASE) = 0;
-    ZP32(kernel.zeropage, ZP_FB_SIZE) = 256 * 1024 * 1024;
-
-    if (cmdline) {
-        memcpy(kernel.cmdline, cmdline, csz);
-    }
-
-    if (ramdisk && rsz) {
-        ZP32(kernel.zeropage, ZP_RAMDISK_BASE) = (uint32_t) (uintptr_t) ramdisk;
-        ZP32(kernel.zeropage, ZP_RAMDISK_SIZE) = rsz;
-    }
-    n = process_memory_map(sys, &key, 0);
-
-    for (i = 0; i < n; i++) {
-        struct e820entry* e = e820table + i;
-        printf("%016" PRIx64 " %016" PRIx64 " %s\n",
-               e->addr, e->size, e820name(e->type));
-    }
-
-    r = sys->BootServices->ExitBootServices(img, key);
-    if (r == EFI_INVALID_PARAMETER) {
-        n = process_memory_map(sys, &key, 1);
-        r = sys->BootServices->ExitBootServices(img, key);
-        if (r) {
-            printf("Cannot ExitBootServices! (2) %s\n", xefi_strerror(r));
-            return -1;
-        }
-    } else if (r) {
-        printf("Cannot ExitBootServices! (1) %s\n", xefi_strerror(r));
-        return -1;
-    }
-
-    install_memmap(&kernel, e820table, n);
-    start_deprecated(&kernel);
-
-    return 0;
-}
@@ -49,19 +49,95 @@ static void start_magenta(uint64_t entry, void* bootdata) {
        ;
 }

+static bool with_extra = false;
+static bootextra_t default_extra = {
+    .reserved0 = 0,
+    .reserved1 = 0,
+    .magic = BOOTITEM_MAGIC,
+    .crc32 = BOOTITEM_NO_CRC32,
+};
+
 static int add_bootdata(void** ptr, size_t* avail,
                        bootdata_t* bd, void* data) {
-    size_t len = BOOTDATA_ALIGN(bd->length);
-    if ((sizeof(bootdata_t) + len) > *avail) {
-        printf("boot: no room for bootdata type=%08x size=%08x\n",
-               bd->type, bd->length);
+    if (with_extra) {
+        size_t len = BOOTDATA_ALIGN(bd->length);
+        if ((sizeof(bootdata_t) + sizeof(bootextra_t) + len) > *avail) {
+            printf("boot: no room for bootdata type=%08x size=%08x\n",
+                   bd->type, bd->length);
+            return -1;
+        }
+        bd->flags |= BOOTDATA_FLAG_EXTRA;
+        memcpy(*ptr, bd, sizeof(bootdata_t));
+        memcpy((*ptr) + sizeof(bootdata_t), &default_extra, sizeof(bootextra_t));
+        memcpy((*ptr) + sizeof(bootdata_t) + sizeof(bootextra_t), data, len);
+        len += sizeof(bootdata_t) + sizeof(bootextra_t);
+        (*ptr) += len;
+        (*avail) -= len;
+    } else {
+        size_t len = BOOTDATA_ALIGN(bd->length);
+        if ((sizeof(bootdata_t) + len) > *avail) {
+            printf("boot: no room for bootdata type=%08x size=%08x\n",
+                   bd->type, bd->length);
+            return -1;
+        }
+        memcpy(*ptr, bd, sizeof(bootdata_t));
+        memcpy((*ptr) + sizeof(bootdata_t), data, len);
+        len += sizeof(bootdata_t);
+        (*ptr) += len;
+        (*avail) -= len;
+    }
+    return 0;
+}
+
+static int header_check(void* image, size_t sz, uint64_t* _entry,
+                        size_t* _hsz, size_t* _flen, size_t* _klen) {
+    bootdata_t* bd = image;
+    size_t hsz, flen, klen;
+    uint64_t entry;
+
+    if (bd->flags & BOOTDATA_FLAG_EXTRA) {
+        hsz = sizeof(bootdata_t) + sizeof(bootextra_t);
+        magenta_kernel2_t* kernel2 = image;
+        if ((sz < sizeof(magenta_kernel2_t)) ||
+            (kernel2->hdr_kernel.type != BOOTDATA_KERNEL) ||
+            ((kernel2->hdr_kernel.flags & BOOTDATA_FLAG_EXTRA) == 0)) {
+            printf("boot: invalid magenta kernel header\n");
+            return -1;
+        }
+        flen = BOOTDATA_ALIGN(kernel2->hdr_file.length);
+        klen = BOOTDATA_ALIGN(kernel2->hdr_kernel.length);
+        entry = kernel2->data_kernel.entry64;
+    } else {
+        hsz = sizeof(bootdata_t);
+        magenta_kernel_t* kernel = image;
+        if ((sz < sizeof(magenta_kernel_t)) ||
+            (kernel->hdr_kernel.type != BOOTDATA_KERNEL)) {
+            printf("boot: invalid magenta kernel header\n");
+            return -1;
+        }
+        flen = BOOTDATA_ALIGN(kernel->hdr_file.length);
+        klen = BOOTDATA_ALIGN(kernel->hdr_kernel.length);
+        entry = kernel->data_kernel.entry64;
+    }
+
+    if (flen > (sz - hsz)) {
+        printf("boot: invalid magenta kernel header (bad flen)\n");
        return -1;
    }
-    memcpy(*ptr, bd, sizeof(bootdata_t));
-    memcpy((*ptr) + sizeof(bootdata_t), data, len);
-    len += sizeof(bootdata_t);
-    (*ptr) += len;
-    (*avail) -= len;
+
+    if (klen > (sz - (hsz * 2))) {
+        printf("boot: invalid magenta kernel header (bad klen)\n");
+        return -1;
+    }
+    if (_entry) {
+        *_entry = entry;
+    }
+    if (_hsz) {
+        *_hsz = hsz;
+        *_flen = flen;
+        *_klen = klen;
+    }
+
    return 0;
 }

@@ -70,48 +146,54 @@ int boot_magenta(efi_handle img, efi_system_table* sys,
                 void* cmdline, size_t csz) {

    efi_boot_services* bs = sys->BootServices;
+    uint64_t entry;

-    magenta_kernel_t* kernel = image;
-    if ((isz < sizeof(magenta_kernel_t)) ||
-        (kernel->hdr_kernel.type != BOOTDATA_KERNEL)) {
-        printf("boot: invalid magenta kernel header\n");
+    if (header_check(image, isz, &entry, NULL, NULL, NULL)) {
        return -1;
    }
-
-    if ((ramdisk == NULL) || (rsz < sizeof(bootdata_t))) {
+    if ((ramdisk == NULL) || (rsz < (sizeof(bootdata_t) + sizeof(bootextra_t)))) {
        printf("boot: ramdisk missing or too small\n");
        return -1;
    }

    bootdata_t* hdr0 = ramdisk;
    if ((hdr0->type != BOOTDATA_CONTAINER) ||
-        (hdr0->extra != BOOTDATA_MAGIC) ||
-        (hdr0->flags != 0)) {
+        (hdr0->extra != BOOTDATA_MAGIC)) {
        printf("boot: ramdisk has invalid bootdata header\n");
        return -1;
    }
-    if ((hdr0->length > (rsz - sizeof(bootdata_t)))) {
+
+    // If the ramdisk container header is a new/large header,
+    // generate all our prepended headers in the same style...
+    size_t hsz = sizeof(bootdata_t);
+    if (hdr0->flags & BOOTDATA_FLAG_EXTRA) {
+        with_extra = true;
+        hsz += sizeof(bootextra_t);
+    }
+
+    if ((hdr0->length > (rsz - hsz))) {
        printf("boot: ramdisk has invalid bootdata length\n");
        return -1;
    }

    // osboot ensures we have FRONT_BYTES ahead of the
    // ramdisk to prepend our own bootdata items.
-    //
-    // We used sizeof(hdr) up front but will overwrite
-    // the header at the start of the ramdisk so it works
-    // out in the end.
-
    bootdata_t hdr;
    void* bptr = ramdisk - FRONT_BYTES;
    size_t blen = FRONT_BYTES;

+    // We create a new container header of the same size
+    // as the one at the start of the ramdisk
    hdr.type = BOOTDATA_CONTAINER;
    hdr.length = hdr0->length + FRONT_BYTES;
    hdr.extra = BOOTDATA_MAGIC;
-    hdr.flags = 0;
+    hdr.flags = with_extra ? BOOTDATA_FLAG_EXTRA : 0;
    memcpy(bptr, &hdr, sizeof(hdr));
    bptr += sizeof(hdr);
+    if (with_extra) {
+        memcpy(bptr, &default_extra, sizeof(default_extra));
+        bptr += sizeof(default_extra);
+    }

    // pass kernel commandline
    hdr.type = BOOTDATA_CMDLINE;
@@ -215,15 +297,18 @@ int boot_magenta(efi_handle img, efi_system_table* sys,
    }

    // fill the remaining gap between pre-data and ramdisk image
-    if ((blen < sizeof(bootdata_t)) || (blen & 7)) {
+    if ((blen < hsz) || (blen & 7)) {
        goto fail;
    }
    hdr.type = BOOTDATA_IGNORE;
-    hdr.length = blen - sizeof(bootdata_t);
+    hdr.length = blen - hsz;
    memcpy(bptr, &hdr, sizeof(hdr));
+    if (with_extra) {
+        memcpy(bptr + sizeof(hdr), &default_extra, sizeof(default_extra));
+    }

    // jump to the kernel
-    start_magenta(kernel->data_kernel.entry64, ramdisk - FRONT_BYTES);
+    start_magenta(entry, ramdisk - FRONT_BYTES);

 fail:
    bs->FreePages(mem, pages);
@@ -232,6 +317,63 @@ fail:

 static char cmdline[CMDLINE_MAX];

+int mxboot(efi_handle img, efi_system_table* sys,
+           void* image, size_t sz) {
+
+    size_t hsz, flen, klen;
+    if (header_check(image, sz, NULL, &hsz, &flen, &klen)) {
+        return -1;
+    }
+
+    // ramdisk portion is file - headers - kernel len
+    uint32_t rlen = flen - hsz - klen;
+    uint32_t roff = (hsz * 2) + klen;
+    if (rlen == 0) {
+        printf("mxboot: no ramdisk?!\n");
+        return -1;
+    }
+
+    // allocate space for the ramdisk
+    efi_boot_services* bs = sys->BootServices;
+    size_t rsz = rlen + hsz + FRONT_BYTES;
+    size_t pages = BYTES_TO_PAGES(rsz);
+    void* ramdisk = NULL;
+    efi_status r = bs->AllocatePages(AllocateAnyPages, EfiLoaderData, pages,
+                                     (efi_physical_addr*)&ramdisk);
+    if (r) {
+        printf("mxboot: cannot allocate ramdisk buffer\n");
+        return -1;
+    }
+
+    ramdisk += FRONT_BYTES;
+    bootdata_t* hdr = ramdisk;
+    hdr->type = BOOTDATA_CONTAINER;
+    hdr->length = rlen;
+    hdr->extra = BOOTDATA_MAGIC;
+    hdr->flags = 0;
+    if (hsz != sizeof(bootdata_t)) {
+        hdr->flags |= BOOTDATA_FLAG_EXTRA;
+        memcpy(hdr + 1, &default_extra, sizeof(default_extra));
+    }
+    memcpy(ramdisk + hsz, image + roff, rlen);
+    rlen += hsz;
+
+    printf("ramdisk @ %p\n", ramdisk);
+
+    size_t csz = cmdline_to_string(cmdline, sizeof(cmdline));
+
+    // shrink original image header to include only the kernel
+    if (hsz == sizeof(bootdata_t)) {
+        magenta_kernel_t* kernel = image;
+        kernel->hdr_file.length = hsz + klen;
+    } else {
+        magenta_kernel2_t* kernel2 = image;
+        kernel2->hdr_file.length = hsz + klen;
+    }
+
+    return boot_magenta(img, sys, image, roff, ramdisk, rlen, cmdline, csz);
+}
+
 int boot_kernel(efi_handle img, efi_system_table* sys,
                void* image, size_t sz,
                void* ramdisk, size_t rsz) {
@@ -240,10 +382,9 @@ int boot_kernel(efi_handle img, efi_system_table* sys,

    bootdata_t* bd = image;
    if ((bd->type == BOOTDATA_CONTAINER) &&
-        (bd->extra == BOOTDATA_MAGIC) &&
-        (bd->flags == 0)) {
+        (bd->extra == BOOTDATA_MAGIC)) {
        return boot_magenta(img, sys, image, sz, ramdisk, rsz, cmdline, csz);
    } else {
-        return boot_deprecated(img, sys, image, sz, ramdisk, rsz, cmdline, csz);
+        return -1;
    }
 }
@@ -8,6 +8,7 @@
 #include <device_id.h>
 #include <inet6.h>
 #include <netifc.h>
+#include <xefi.h>

 #include <magenta/boot/netboot.h>
 #include <tftp/tftp.h>
@@ -240,6 +241,14 @@ static int udp_timeout_set(uint32_t timeout_ms, void* cookie) {
    return 0;
 }

+static int strcmp8to16(const char* str8, const char16_t* str16) {
+    while (*str8 != '\0' && *str8 == *str16) {
+        str8++;
+        str16++;
+    }
+    return *str8 - *str16;
+}
+
 void tftp_recv(void* data, size_t len, const ip6_addr* daddr, uint16_t dport,
               const ip6_addr* saddr, uint16_t sport) {
    static tftp_session* session = NULL;
@@ -258,6 +267,12 @@ void tftp_recv(void* data, size_t len, const ip6_addr* daddr, uint16_t dport,
            return;
        }

+        // Override our window size on the Acer tablet
+        if (!strcmp8to16("INSYDE Corp.", gSys->FirmwareVendor)) {
+            uint16_t window_size = 8;
+            tftp_set_options(session, NULL, NULL, &window_size);
+        }
+
        // Initialize file interface
        tftp_file_interface file_ifc = {NULL, buffer_open, NULL, buffer_write, buffer_close};
        tftp_session_set_file_interface(session, &file_ifc);
@@ -374,11 +374,17 @@ EFIAPI efi_status efi_main(efi_handle img, efi_system_table* sys) {
    printf("\n\n");
    print_cmdline();

-    // Look for a kernel image on disk
-    // TODO: use the filesystem protocol
+    // First look for a self-contained magentaboot image
    size_t ksz = 0;
-    void* kernel = xefi_load_file(L"magenta.bin", &ksz, 0);
+    void* kernel = xefi_load_file(L"mxboot.bin", &ksz, 0);

+    if (kernel) {
+        mxboot(img, sys, kernel, ksz);
+    }
+
+    // Look for a kernel image on disk
+    ksz = 0;
+    kernel = xefi_load_file(L"magenta.bin", &ksz, 0);
    if (!have_network && kernel == NULL) {
        goto fail;
    }
@@ -34,3 +34,6 @@ int boot_deprecated(efi_handle img, efi_system_table* sys,
                    void* image, size_t sz,
                    void* ramdisk, size_t rsz,
                    void* cmdline, size_t csz);
+
+int mxboot(efi_handle img, efi_system_table* sys,
+           void* image, size_t sz);
@@ -0,0 +1,21 @@
+# Architecture Support
+
+Fuchsia supports two ISAs: arm64 and x86-64.
+
+## arm64
+
+Fuchsia supports arm64 (also called AArch64) with no restrictions on
+supported microarchitectures.
+
+## x86-64
+
+Fuchsia supports x86-64 (also called IA32e or AMD64), but with some restrictions
+on supported microarchitectures.
+
+### Intel
+
+For Intel CPUs, only Broadwell and later are actively supported and will receive new features.  Additionally, we will accept patches to keep Nehalem and later booting.
+
+### AMD
+
+AMD CPUs are not actively supported (in particular, we have no active testing on them), but we will accept patches to ensure correct booting on them.
@@ -0,0 +1,131 @@
+### Micro-benchmarks
+
+The benchmarks recorded below are obtained by running magenta-benchmarks in a
+release build of fuchsia via ssh. When the benchmarks are recorded the Fuchsia user
+shell (GPU-accelerated) is running but no user has yet logged in.
+
+These are the running processes at the time of the benchmark:
+
+```
+ ps
+TASK                    PSS PRIVATE  SHARED NAME
+j:1029               796.8M  783.9M         root
+  p:1044             558.8M  558.8M     28k bin/devmgr
+  j:1078              48.1M   39.6M         magenta-drivers
+    p:1752           180.8k    180k     28k devhost:root
+    p:1791          1596.8k   1596k     28k devhost:acpi
+    p:1840           592.8k    592k     28k devhost:misc
+    p:4684            35.0M   26.7M   16.5M devhost:pci#1:8086:5916
+    p:4730          1420.8k   1420k     28k devhost:pci#3:8086:9d2f
+    p:4858          8540.8k   8540k     28k devhost:pci#6:8086:9d03
+    p:4979           532.8k    532k     28k devhost:pci#14:8086:9d71
+    p:5052           546.8k    380k    360k devhost:pci#16:8086:15d8
+  j:1179            5745.4k   1624k         magenta-services
+    p:1182           256.8k    256k     28k crashlogger
+    p:1330          4490.8k    440k   8128k virtual-console
+    p:1425           266.8k    200k    160k netsvc
+    p:4547           176.8k    176k     28k sh:console
+    p:6058           184.8k    184k     28k vc:sh
+    p:6093           184.8k    184k     28k vc:sh
+    p:6148           184.8k    184k     28k vc:sh
+  j:1180             184.3M  183.9M         fuchsia
+    p:1234           588.8k    588k     28k appmgr
+    j:2000           183.8M  183.3M         root
+      p:2045         688.8k    688k     28k bootstrap
+      j:2336         183.1M  182.6M         boot
+        p:2427      1320.8k   1320k     28k wlanstack
+        p:2467       316.8k    316k     28k device_runner
+        p:2505       320.8k    320k     28k listen
+        p:2707      3532.8k   3432k    228k netstack
+        p:3001       288.8k    288k     28k device_runner_monitor
+        p:3101       468.8k    468k     28k netconnector
+        p:3412       336.8k    336k     28k trace_manager
+        p:3529       356.8k    356k     28k root_presenter
+        p:3587       124.2M  124.0M    404k flutter:userpicker_device_shell
+        p:3810       288.8k    288k     28k ktrace_provider
+        p:3955       332.8k    332k     28k view_manager
+        p:4110        49.2M   49.1M    356k scene_manager
+        p:4269       456.8k    456k     28k icu_data
+        p:4404       456.8k    456k     28k fonts
+        p:24555      296.8k    296k     28k oauth_token_manager
+        j:3240      1102.3k   1100k         tcp:22
+          j:24964   1102.3k   1100k         fe80::a2b3:ccff:fefb:4467:43218
+            p:24965  712.8k    712k     28k /system/bin/sshd
+            p:25157  216.8k    216k     28k /boot/bin/sh
+            p:25311  172.8k    172k     28k /boot/bin/ps
+
+```
+
+The typical thread load of the system before running the benchmarks:
+
+```
+ cpu    load sched (cs ylds pmpts)  pf  sysc ints (hw  tmr tmr_cb) ipi (rs  gen)
+   0   0.01%        32    0     0    2    51        0    3      3        9    0
+   1   0.03%       255    0     0    3   496        0  115    115       10    0
+   2   0.45%        55    0     0    1  4218        6   11     11        8    0
+   3   0.02%        24    0     0    0    44        0    7      7        5    0
+ cpu    load sched (cs ylds pmpts)  pf  sysc ints (hw  tmr tmr_cb) ipi (rs  gen)
+   0   0.00%        17    0     0    1    27        0    1      1        9    0
+   1   0.00%        13    0     0    1    19        0    2      2        8    0
+   2   0.48%       297    0     0    1  3800        5  129    129        6    1
+   3   0.02%        28    0     0    3    45        0    5      5        9    1
+ cpu    load sched (cs ylds pmpts)  pf  sysc ints (hw  tmr tmr_cb) ipi (rs  gen)
+   0   0.16%       236    0     0   16   483       11   62     62       36   25
+   1   0.19%        96    0     0   35   344        0    5      5       27   39
+   2   0.57%       161    0     0   15  4715        6   15     15       53   31
+   3   0.15%       196    0     0   20   492        0   60     60       32   28
+```
+
+It is believed that the running processes have a very minor impact on benchmark results.
+
+
+## Run 8-17-2017
+
+Intel NUC  Model: NUC7i3BNK
+
+Processor: i3-7100U @ 2.40 GHz (Cache: 3M)
+Memory type: DDR4-2133 1.2V SO-DIMM
+Max Memory Bandwidth 34.1 GB/s
+
+```
+buildid:  GIT_5E66D79D5A167878ACF9A944AF92D0EBB6A60DF2
+ELF build ID: d1af6f49136a548ddc216a079f29341e7f4f8df9
+
+Benchmark                               Time           CPU Iterations
+---------------------------------------------------------------------
+Channel/Create                        896 ns        897 ns     778195
+Channel/Write/64                      728 ns        730 ns     950200   83.6564MB/s
+Channel/Write/1024                    771 ns        773 ns     906612   1.23397GB/s
+Channel/Write/32k                    2147 ns       2149 ns     323812   14.1992GB/s
+Channel/Write/64k                    3600 ns       3599 ns     192480   16.9572GB/s
+Channel/Read/64                       717 ns        718 ns     972365   85.0027MB/s
+Channel/Read/1024                     750 ns        751 ns     934482   1.27003GB/s
+Channel/Read/32k                     2102 ns       2101 ns     332341   14.5272GB/s
+Channel/Read/64k                     3550 ns       3545 ns     198392    17.217GB/s
+ChannelMultiProcess/Write/64        88319 ns       1114 ns     100000   54.7862MB/s
+ChannelMultiProcess/Write/1024     238838 ns       1779 ns     100000   548.933MB/s
+ChannelMultiProcess/Write/32k      322097 ns      22632 ns      38626   1.34843GB/s
+ChannelMultiProcess/Write/64k      207986 ns      39543 ns      19765   1.54353GB/s
+ChannelMultiProcess/Read/64          1141 ns       1025 ns     671510    59.561MB/s
+ChannelMultiProcess/Read/1024        1292 ns       1148 ns     602280   850.681MB/s
+ChannelMultiProcess/Read/32k        19830 ns       5456 ns     128700   5.59307GB/s
+ChannelMultiProcess/Read/64k        38534 ns      10650 ns      67404   5.73121GB/s
+Event/Create                          591 ns        594 ns    1181620
+Event/Close                           681 ns        680 ns    1032407
+Event/Signal                          201 ns        199 ns    3506137
+EventPair/Create                      870 ns        871 ns     802191
+Fifo/Create                          1030 ns       1028 ns     685065
+Port/Create/0                         607 ns        610 ns    1146240
+Port/Create/0                         607 ns        609 ns    1147258
+Socket/Write/64                       698 ns        701 ns    1001960   87.0535MB/s
+Socket/Write/1024                     717 ns        720 ns     969184    1.3249GB/s
+Socket/Write/32k                     3055 ns       3047 ns     230028   10.0172GB/s
+Socket/Write/64k                     5372 ns       5327 ns     131993    11.458GB/s
+Socket/Read/64                        649 ns        652 ns    1073736    93.671MB/s
+Socket/Read/1024                      673 ns        674 ns    1039222   1.41413GB/s
+Socket/Read/32k                      2933 ns       2919 ns     240752   10.4564GB/s
+Socket/Read/64k                      5986 ns       5945 ns     122719   10.2659GB/s
+Syscall/Null                           69 ns         68 ns   10327057
+Syscall/ManyArgs                       77 ns         76 ns    9134297
+Thread/Create                        4992 ns       4967 ns     141135
+```
@@ -2,11 +2,11 @@

 ## Introduction

-The kernel manages a number of different types of Objects.  Those which are accessible
-directly via system calls are actual C++ objects which implement the Dispatcher
-interface.  These are implemented in the kernel's [libmagenta](../kernel/lib/magenta).
-Many are self-contained higher level Objects.  Some wrap lower level lk primitives.
-
+The kernel manages a number of different types of Objects. Those which are
+accessible directly via system calls are C++ classes which implement the
+Dispatcher interface. These are implemented in
+[kernel/object](../kernel/object). Many are self-contained higher-level Objects.
+Some wrap lower-level lk primitives.

 ## [System Calls](syscalls.md)

@@ -31,9 +31,11 @@ and [*mx_port_bind()*](syscalls/port_bind.md).
 [*mx_channel_create()*](syscalls/channel_create.md).  Access to these (and limitations
 upon them) is controlled by the Job in which the calling Process is contained.

-System calls are provided by libmagenta.so, which is a "virtual" shared library (VDSO)
-that the Magenta Kernel provides to userspace.  They are C ELF ABI functions of the
-form *mx_noun_verb()* or *mx_noun_verb_direct-object()*
+System calls are provided by libmagenta.so, which is a "virtual" shared
+library that the Magenta kernel provides to userspace, better known as the
+[*virtual Dynamic Shared Object* or vDSO](vdso.md).
+They are C ELF ABI functions of the form *mx_noun_verb()* or
+*mx_noun_verb_direct-object()*.

 The system calls are defined by [syscalls.sysgen](../system/public/magenta/syscalls.sysgen)
 and processed by the [sysgen](../system/host/sysgen/) tool into include files and glue
@@ -66,15 +68,18 @@ the last one for that Object.

 ## Running Code: Jobs, Processes, and Threads.

-Threads represent threads of execution (CPU registers, stack, etc) within an address
-space which is owned by the Process in which they exist.  Processes are owned by Jobs,
-which define various resource limitations.  Jobs are owned by parent Jobs, all the way
-up to the Root Job which was created by the kernel at boot and passed to "userboot",
-the first userspace Process to begin execution.
+Threads represent threads of execution (CPU registers, stack, etc) within an
+address space which is owned by the Process in which they exist.  Processes are
+owned by Jobs, which define various resource limitations.  Jobs are owned by
+parent Jobs, all the way up to the Root Job which was created by the kernel at
+boot and passed to [`userboot`, the first userspace Process to begin execution](userboot.md).

 Without a Job Handle, it is not possible for a Thread within a Process to create another
 Process or another Job.

+[Program loading](program_loading.md) is provided by userspace facilities and
+protocols above the kernel layer.
+
 See: [process_create](syscalls/process_create.md),
 [process_start](syscalls/process_start.md),
 [thread_create](syscalls/thread_create.md),
@@ -35,42 +35,53 @@ allowed.
  - Global constructors
    - Currently we have these for global data structures.

-## mxtl
-We have built our own template library, called *mxtl*, to
+## fbl
+We have built our own template library, called *fbl*, to
 address our particular needs. This library is split into two parts:

-1. [system/ulib/mxtl](../system/ulib/mxtl) which is usable from both
+1. [system/ulib/fbl](../system/ulib/fbl) which is usable from both
   kernel and userspace.
-2. [kernel/lib/mxtl](../kernel/lib/mxtl) which is usable only from
+2. [kernel/lib/fbl](../kernel/lib/fbl) which is usable only from
    the kernel.

-*mxtl* provides
+*fbl* provides

 - utility code
-  - [basic algorithms](../system/ulib/mxtl/include/mxtl/algorithm.h)
-  - [integer type limits](../system/ulib/mxtl/include/mxtl/limits.h)
-  - [type traits](../system/ulib/mxtl/include/mxtl/type_support.h)
-  - [atomics](../system/ulib/mxtl/include/mxtl/atomic.h)
+  - [basic algorithms](../system/ulib/fbl/include/fbl/algorithm.h)
+  - [integer type limits](../system/ulib/fbl/include/fbl/limits.h)
+  - [type traits](../system/ulib/fbl/include/fbl/type_support.h)
+  - [atomics](../system/ulib/fbl/include/fbl/atomic.h)
+  - [alloc checking new](../system/ulib/fbl/include/fbl/alloc_checker.h)
 - allocators
-  - [slab allocation](../system/ulib/mxtl/include/mxtl/slab_allocator.h)
-  - [slab malloc](../system/ulib/mxtl/include/mxtl/slab_malloc.h)
+  - [slab allocation](../system/ulib/fbl/include/fbl/slab_allocator.h)
+  - [slab malloc](../system/ulib/fbl/include/fbl/slab_malloc.h)
 - arrays
-  - [fixed sized arrays](../system/ulib/mxtl/include/mxtl/array.h)
-  - [fixed sized arrays](../system/ulib/mxtl/include/mxtl/inline_array.h),
+  - [fixed sized arrays](../system/ulib/fbl/include/fbl/array.h)
+  - [fixed sized arrays](../system/ulib/fbl/include/fbl/inline_array.h),
    which stack allocates small arrays
 - inline containers
-  - [doubly linked list](../system/ulib/mxtl/include/mxtl/intrusive_double_list.h)
-  - [hash table](../system/ulib/mxtl/include/mxtl/intrusive_hash_table.h)
-  - [singly linked list](../system/ulib/mxtl/include/mxtl/intrusive_single_list.h)
-  - [wavl trees](../system/ulib/mxtl/include/mxtl/intrusive_wavl_tree.h)
+  - [doubly linked list](../system/ulib/fbl/include/fbl/intrusive_double_list.h)
+  - [hash table](../system/ulib/fbl/include/fbl/intrusive_hash_table.h)
+  - [singly linked list](../system/ulib/fbl/include/fbl/intrusive_single_list.h)
+  - [wavl trees](../system/ulib/fbl/include/fbl/intrusive_wavl_tree.h)
 - smart pointers
-  - [intrusive refcounting mixin](../system/ulib/mxtl/include/mxtl/ref_counted.h)
-  - [intrusive refcounted pointer](../system/ulib/mxtl/include/mxtl/ref_ptr.h)
-  - [unique pointer](../system/ulib/mxtl/include/mxtl/unique_ptr.h)
+  - [intrusive refcounting mixin](../system/ulib/fbl/include/fbl/ref_counted.h)
+  - [intrusive refcounted pointer](../system/ulib/fbl/include/fbl/ref_ptr.h)
+  - [unique pointer](../system/ulib/fbl/include/fbl/unique_ptr.h)
 - raii utilities
-  - [auto call](../system/ulib/mxtl/include/mxtl/auto_call.h) to run
+  - [auto call](../system/ulib/fbl/include/fbl/auto_call.h) to run
    code upon leaving scope
-  - [AutoLock](../system/ulib/mxtl/include/mxtl/auto_lock.h)
+  - [AutoLock](../system/ulib/fbl/include/fbl/auto_lock.h)
+
+The standard operator new is assumed to either return valid memory or
+to throw std::bad_alloc. This policy is not suitable for the
+kernel. We also want to dynamically enforce that returns are
+explicitly checked. As such, fbl introduces our own operator new
+overload which takes a reference to an `AllocChecker`. If the status
+of the `AllocChecker` is not queried after the new expression, an
+assertion is raised. This lets us enforce that the return value is
+checked without having to reason about optimizations of the standard
+operator new in the presence of -fno-exceptions and so on.

 ## mx

@@ -94,19 +105,3 @@ the libc. See extensive comments in musl's atexit implementation if
 you are curious.

 *This library is mutually exclusive of the standard C++ library.*
-
-## mxalloc
-
-The standard operator new is assumed to either return valid memory or
-to throw std::bad_alloc. This policy is not suitable for the
-kernel. We also want to dynamically enforce that returns are
-explicitly checked. As such, [the mxalloc
-library](../system/ulib/mxalloc) introduces our own operator new
-overload which takes a reference to an `AllocChecker`. If the status
-of the `AllocChecker` is not queried after the new expression, an
-assertion is raised. This lets us enforce that the return value is
-checked without having to reason about optimizations of the standard
-operator new in the presence of -fno-exceptions and so on.
-
-This library can be linked into programs that use the standard
-library, and also into programs that use `mxcpp`.
@@ -0,0 +1,41 @@
+# ACPI debugging
+
+## ACPICA debug interfaces
+
+To turn on ACPICA's debug output, pass "ENABLE\_ACPI\_DEBUG=1" to make.  When this
+option is enabled, ACPICA uses two global variables to control debug output.
+
+### AcpiDbgLevel
+
+AcpiDbgLevel is a bitmap of values defined in
+third\_party/lib/acpica/source/include/acpica/acoutput.h with the prefix
+"ACPI\_LV\_".  For convenience, there are some pre-defined verbosity levels:
+ACPI\_LV\_VERBOSITY1, ACPI\_LV\_VERBOSITY2, ACPI\_LV\_VERBOSITY3.  These control
+types of tracing events to log.  For example, if you want to trace all function
+calls and mutex operations, you can set AcpiDbgLevel to
+
+"ACPI\_LV\_FUNCTIONS | ACPI\_LV\_MUTEX"
+
+### AcpiDbgLayer
+
+AcpiDbgLayer is a bitmap of values defined in
+third\_party/lib/acpica/source/include/acpica/acoutput.h.  These do not have a
+common prefix, but are listed as "Component IDs".  These control which
+submodules of ACPICA are to be traced.  For example, to trace through the
+namespace logic and the executor, you can set AcpiDbgLayer to
+
+"ACPI\_NAMESPACE | ACPI\_EXECUTOR"
+
+### Setting these values
+
+One easy place to set these is in the AcpiOsInitialize method that we define in
+third\_party/lib/acpica/source/os\_specific/service\_layers/osfuchsia.cpp.
+One technique that may be useful is zeroing both values in AcpiOsInitialize, and
+setting them to non-zero values immediately before a call into ACPICA of interest.
+
+### AcpiDebugTrace
+
+There is additionally a method named AcpiDebugTrace in the ACPICA API.  It
+supposedly supports tracing particular ACPI methods by their 4-character
+namespace names (but with no scoping to particular Nodes).  See the ACPICA
+manual for details.
@@ -212,7 +212,7 @@ Notes
   values would range from [0x0000, 0xFFFF] with 0x8000 representing zero
   deflection.
 * When used to set formats, exactly one non-flag bit **must** be set.
- * When used to describe supported formats, and number of non-flag bits **may**
+ * When used to describe supported formats, any number of non-flag bits **may**
   be set.  Flags (when present) apply to all of the relevant non-flag bits in
   the bitfield.  eg.  If a stream supports COMPRESSED, 16BIT and 32BIT_FLOAT, and
   the UNSIGNED bit is set, it applies only to the 16BIT format.
@@ -231,7 +231,134 @@ Notes

 ### Enumeration of supported formats

-> TODO: define how to do this using fixed length messages
+In order to determine the formats supported by a given audio stream,
+applications send an `AUDIO_STREAM_CMD_GET_FORMATS` message over the stream
+channel.  No additional parameters are required.  Drivers **must** respond to
+this request using one or more `audio_stream_cmd_get_formats_resp_t` messages,
+even if only to report that there are no formats currently supported.
+
+### Range structures
+
+Drivers indicate support for formats by sending messages containing zero or more
+`audio_stream_format_range_t` structures.  Each structure contains fields which
+describe...
+ * A bitmask of supported sample formats.
+ * A minimum and maximum number of channels.
+ * A set of frame rates.
+
+A single range structure indicates support for each of the combinations of the
+three different sets of values (sample formats, channel counts, and frame
+rates).  For example, if a range structure indicated support for...
+ * 16 bit signed LPCM samples
+ * 48000 and 44100 Hz frame rates
+ * 1 and 2 channels
+
+Then the fully expanded set of supported formats indicated by the range
+structure would be...
+ * Stereo 16-bit 48 KHz audio
+ * Stereo 16-bit 44.1 KHz audio
+ * Mono 16-bit 48 KHz audio
+ * Mono 16-bit 44.1 KHz audio
+
+See the Sample Formats section (above) for a description of how sample formats
+are encoded in the `sample_formats` member of a range structure.
+
+Supported channel counts are indicated using a pair of min/max channels fields
+which indicate an inclusive range of channel counts which apply to this range.
+For example, a min/max channels range of [1, 4] would indicate that this audio
+stream supports 1, 2, 3 or 4 channels.  A range of [2, 2] would indicate that
+this audio stream supports only stereo audio.
+
+Supported frame rates are signalled similarly to channel counts using a pair of
+min/max frame per second fields along with a flags field.  While the min/max
+values provide an inclusive range of frame rates, the flags determine how to
+interpret this range.  Currently defined flags include...
+Flag | Definition
+-----|-----------
+`ASF_RANGE_FLAG_FPS_CONTINUOUS` | The frame rate range is continuous.  All frame rates in the range [min, max] are valid.
+`ASF_RANGE_FLAG_FPS_48000_FAMILY` | The frame rate range includes the members of the 48 KHz family which exist in the range [min, max]
+`ASF_RANGE_FLAG_FPS_44100_FAMILY` | The frame rate range includes the members of the 44.1 KHz family which exist in the range [min, max]
+
+So, conceptually, the valid frame rates are the union of the sets produced by
+applying each of the flags which are set to the inclusive [min, max] range.  For
+example, if both the 48 KHz and 44.1 KHz family flags were set, and the range given was
+[16000, 47999], then the supported frame rates for this range would be
+ * 16000 Hz
+ * 22050 Hz
+ * 32000 Hz
+ * 44100 Hz
+
+The official members of the 48 KHz and 44.1 KHz families are
+Family | Frame Rates
+-------|------------
+`ASF_RANGE_FLAG_FPS_48000_FAMILY` | 8000 16000 32000 48000 96000 192000 384000 768000
+`ASF_RANGE_FLAG_FPS_44100_FAMILY` | 11025 22050 44100 88200 176400
+
+Drivers **must** set at least one of the flags, or else the set of supported
+frame rates is empty and there was no reason to transmit this range structure.
+Also note that the set of valid frame rates is the union of the frame rates
+produced by applying each of the set flags.  This implies that there is never any
+good reason to set the `ASF_RANGE_FLAG_FPS_CONTINUOUS` in conjunction with any
+of the other flags.  While it is technically legal to do so, drivers **should**
+avoid this behavior.
+
+### Transporting range structures
+
+Range structures are transmitted from drivers to applications using the
+`audio_stream_cmd_get_formats_resp_t` message.  Because of the large number of
+formats which may be supported by a stream, drivers may need to send multiple
+messages in order to enumerate all available modes.  Messages include the
+following fields.
+ * A standard `audio_cmd_hdr_t` header.  **All** messages involved in the
+   response to an application request **must** use the transaction ID of the
+   original request, and **must** set the cmd field of the header to
+   `AUDIO_STREAM_CMD_GET_FORMATS`.
+ * A `format_range_count` field.  This indicates the total number of format
+   range structures which will be sent in this response to the application.
+   This number **must** be present in **all** messages involved in the response,
+   and **must not** change from message to message.
+ * A `first_format_range_ndx` field indicating the zero-based index of the first
+   format range being specified in this particular message.  See below for
+   details.
+ * An array of `audio_stream_format_range_t` structures which is at most
+   `AUDIO_STREAM_CMD_GET_FORMATS_MAX_RANGES_PER_RESPONSE` elements long.
+
+Drivers **must**
+ * Always transmit all of the available audio format ranges.
+ * Always transmit the available audio format ranges in ascending index order.
+ * Always pack as many ranges as possible in the fixed size message structure.
+ * Never overlap index regions or leave gaps.
+
+Given these requirements, if the maximum number of ranges per response were 15,
+and a driver needed to send 35 ranges in response to an application's request,
+then 3 messages in total would be needed, and the `format_range_count` and
+`first_format_range_ndx` fields for each message would be as follows.
+Msg # | `format_range_count` | `first_format_range_ndx`
+------|----------------------|-------------------------
+1 | 35 | 0
+2 | 35 | 15
+3 | 35 | 30
+
+`first_format_range_ndx` **must** never be greater than `format_range_count`,
+however `format_range_count` **may** be zero if an audio stream currently
+supports no formats.  The total number of `audio_stream_format_range_t`
+structures in an `audio_stream_cmd_get_formats_resp_t` message is given by the
+formula
+
+```C
+valid_ranges = MIN(AUDIO_STREAM_CMD_GET_FORMATS_MAX_RANGES_PER_RESPONSE,
+                   msg.format_range_count - msg.first_format_range_ndx);
+```
+
+Drivers **may** choose to always send an entire
+`audio_stream_cmd_get_formats_resp_t` message, or to send a truncated message
+which ends after the last valid range structure in the `format_ranges` array.
+Applications **must** be prepared to receive up to
+`sizeof(audio_stream_cmd_get_formats_resp_t)` bytes for each message, but also
+accept messages as short as `offsetof(audio_stream_cmd_get_formats_resp_t, format_ranges)`.
+
+> TODO: how do devices signal a change of supported formats (think, HDMI hotplug
+> event)?  Are such devices required to simply remove and republish the device?

 > TODO: define how to enumerate supported compressed bitstream formats.

@@ -0,0 +1,201 @@
+# Entropy quality tests
+
+This document describes how we test the quality of the entropy sources used to
+seed the Magenta CPRNG.
+
+[TOC]
+
+## Theoretical concerns
+
+Approximately speaking, it's sometimes easy to tell that a stream of numbers is
+not random by recognizing a pattern in it. It's impossible to be sure that the
+numbers are truly random. The state of the art seems to be running several
+statistical tests on the data, and hoping to detect any exploitable weaknesses.
+
+The problem of testing for randomness gets more difficult when the random
+numbers aren't perfectly random (when their distributions aren't uniform, or
+when there are some limited correlations between numbers in the sequence). A
+stream of non-perfect random numbers still contains some randomness, but it's
+hard to determine how random it is.
+
+For our purposes, a good measure of how much randomness is contained in a stream
+of non-perfectly random numbers is the min-entropy. This is related to the
+Shannon entropy used in information theory, but it always takes a smaller value.
+The min-entropy controls how much randomness we can reliably extract from the
+entropy source; see, for example
+<https://en.wikipedia.org/wiki/Randomness_extractor#Formal_definition_of_extractors>
+
+From a practical standpoint, we can use the test suite described in US NIST
+SP800-90B to analyze samples of random data from an entropy source. A prototype
+implementation for the tests is available from
+<https://github.com/usnistgov/SP800-90B_EntropyAssessment>. The suite takes a
+sample data file (say, 1MB of random bytes) as input. The nice thing about this
+test suite is that it can handle non-perfect RNGs, and it reports an estimate
+for how much min-entropy is contained in each byte of the random data stream.
+
+### The importance of testing unprocessed data
+
+After drawing entropy from our entropy sources, we will mix it into the CPRNG in
+a "safe" way that basically gets rid of detectable correlations and
+distributional imperfections in the raw random byte stream from the entropy
+source. This is a very important thing to do when actually generating random
+numbers to use, but we must avoid this mixing and processing phase when testing
+the entropy source itself.
+
+For a stark example of why it's important to test unprocessed data if we want to
+test our actual entropy sources, here's an experiment. It should run on any
+modern linux system with OpenSSL installed.
+
+    head -c 1000000 /dev/zero >zero.bin
+    openssl enc -aes-256-ctr -in zero.bin -out random.bin -nosalt -k "password"
+
+This takes one million bytes from /dev/zero, encrypts them via AES-256, with a
+weak password and no salt (a terrible crypto scheme, of course!). The fact that
+the output looks like good random data is a sign that AES is working as
+intended, but this demonstrates the risk of estimating entropy content from
+processed data: together, /dev/zero and "password" provide ~0 bits of entropy,
+but our tests are way more optimistic about the resulting data!
+
+For a more concrete Magenta-related example, consider jitterentropy (the RNG
+discussed here: <http://www.chronox.de/jent/doc/CPU-Jitter-NPTRNG.html>).
+Jitterentropy draws entropy from variations in CPU timing. The unprocessed data
+are how long it took to run a certain block of CPU- and memory-intensive code
+(in nanoseconds). Naturally, these time data are not perfectly random: there's
+an average value that they center around, with some fluctuations. Each
+individual data sample might be several bits (e.g. a 64-bit integer) but only
+contribute 1 bit or less of min-entropy.
+
+The full jitterentropy RNG code takes several raw time data samples and
+processes them into a single random output (by shifting through a LFSR, among
+other things). If we test the processed output, we're seeing apparent randomness
+both from the actual timing variations and from the LFSR. We want to focus on
+just the timing variation, so we should test the raw time samples. Note that
+jitterentropy's built-in processing can be turned on and off via the
+`kernel.jitterentropy.raw` cmdline.
+
+## Quality test implementation
+
+As mentioned above, the NIST test suite takes a file full of random bytes as
+input. We collect those bytes on a Magenta system (possibly with a thin Fuchsia
+layer on top), then usually export them to a more capable workstation to run the
+test suite.
+
+## Boot-time tests
+
+Some of our entropy sources are read during boot, before userspace is started.
+To test these entropy sources in a realistic environment, we run the tests
+during boot. The relevant code is in
+`kernel/lib/crypto/entropy/quality_test.cpp`, but the basic idea is that the
+kernel allocates a large static buffer to hold test data during early boot
+(before the VMM is up, so before it's possible to allocate a VMO). Later on, the
+data is copied into a VMO, and the VMO is passed to userboot and devmgr, where
+it's presented as a pseudo-file at `/boot/kernel/debug/entropy.bin`. Userspace
+apps can read this file and export the data (by copying to persistent storage or
+using the network, for example).
+
+### Boot-time tests: building
+
+Since the boot-time entropy test requires that a large block of memory be
+permanently reserved (for the temporary, pre-VMM buffer), we don't usually build
+the entropy test mode into the kernel. The tests are enabled by passing the
+`ENABLE_ENTROPY_COLLECTOR_TEST` flag at build time, e.g. by adding
+
+```
+EXTERNAL_DEFINES += ENABLE_ENTROPY_COLLECTOR_TEST=1
+```
+
+to `local.mk`. Currently, there's also a build-time constant,
+`ENTROPY_COLLECTOR_TEST_MAXLEN`, which (if provided) is the size of the
+statically allocated buffer. The default value if unspecified is 128kB.
+
+### Boot-time tests: configuring
+
+The boot-time tests are controlled via kernel cmdlines. The relevant cmdlines
+are `kernel.entropy-test.*`, documented in
+[kernel\_cmdlines.md](kernel_cmdlines.md).
+
+Some entropy sources, notably jitterentropy, have parameter values that can be
+tweaked via kernel cmdline. Again, see [kernel\_cmdlines.md](kernel_cmdlines.md)
+for further details.
+
+### Boot-time tests: running
+
+The boot-time tests will run automatically during boot, as long as the correct
+kernel cmdlines are passed (if there are problems with the cmdlines, error
+messages will be printed instead). The tests run just before the first stage of
+RNG seeding, which happens at LK\_INIT\_LEVEL\_PLATFORM\_EARLY, shortly before
+the heap and the VMM are brought up. If running a large test, boot will often slow
+down noticeably. For example, collecting 128kB of data from jitterentropy on
+rpi3 can take around a minute, depending on the parameter values.
+
+## Run-time tests
+
+*TODO(SEC-29): discuss actual user-mode test process*
+
+*Current rough ideas: only the kernel can trigger hwrng reads. To test,
+userspace issues a kernel command (e.g. `k hwrng test`), with some arguments to
+specify the test source and length. The kernel collects random bytes into the
+existing VMO-backed pseudo-file at `/boot/kernel/debug/entropy.bin`, assuming
+that this is safely writeable. Currently unimplemented; blocked by lack of a
+userspace HWRNG driver. Can test the VMO-rewriting mechanism first.*
+
+## Test data export
+
+Test data is saved in `/boot/kernel/debug/entropy.bin` in the Magenta system
+under test. So far I've usually exported the data file manually via `netcp`.
+Other options include `scp` if you build with the correct Fuchsia packages, or
+saving to persistent storage (probably using the Fuchsia `thinfs` FAT
+filesystem, so you can read the files on a non-Magenta computer).
+
+## Running the NIST test suite
+
+*Note: the NIST tests aren't actually mirrored in Fuchsia yet. Today, you need
+to clone the tests from the repo at
+<https://github.com/usnistgov/SP800-90B_EntropyAssessment>.*
+
+The NIST test suite has three entry points (as of the version committed on Oct.
+25, 2016): `iid_main.py`, `noniid_main.py`, and `restart.py`. The two "main"
+scripts perform the bulk of the work. The `iid_main.py` script is meant for
+entropy sources that produce independent, identically distributed data samples.
+Most of the testing is to validate the iid condition. Many entropy sources will
+not be iid, so the `noniid_main.py` test implements several entropy estimators
+that don't require iid data.
+
+Note that the test binaries from the NIST repo are Python scripts without a
+shebang line, so you probably need to explicitly call `python` on the command
+line when invoking them.
+
+The first two scripts take two arguments, both mandatory: the data file to read,
+and the number of significant bits per sample (if less than 8, only the low `N`
+bits will be used from each byte). They optionally accept a `-v` flag to produce
+verbose output or `-h` for help.
+
+The `noniid_main.py` also optionally accepts a `-u <int>` flag that can reduce
+the number of bits below the `N` value passed in the second mandatory argument.
+I'm not entirely sure why this flag is provided; it seems functionally
+redundant, but passing it does change the verbose output slightly. My best guess
+is that this is provided because the noniid Markov test only works on samples of
+at most 6 bits, so 7- or 8-bit datasets will be reduced to their low 6 bits for
+this test. In contrast, all the iid tests can run on 8-bit samples.
+
+A sample invocation of the `iid_main.py` script:
+
+```
+python2 -- $FUCHSIA_DIR/third_party/sp800-90b-entropy-assessment/iid_main.py -v /path/to/datafile.bin 8
+```
+
+The `restart.py` script takes the same two arguments, plus a third argument: the
+min-entropy estimate returned by a previous run of `iid_main.py` or
+`noniid_main.py`. This document doesn't describe restart tests. For now, see
+NIST SP800-90B for more details.
+
+## Future directions
+
+### Automation
+
+It would be nice to automate the process of building, configuring, and running a
+quality test. As a first step, it should be easy to write a shell script to
+perform these steps. Even better would be to use the testing infrastructure to
+run entropy collector quality tests automatically, mostly to reduce bit-rot
+in the test code. Failing automation, we have to rely on humans to periodically
+run the tests (or to fix the tests when they break).
@@ -1,6 +1,12 @@
 # Quick Start Recipes

-## Checking out the source code
+## Checking out the Magenta source code
+
+*** note
+NOTE: The Fuchsia source includes Magenta. See Fuchsia's
+[Getting Started](https://fuchsia.googlesource.com/docs/+/master/getting_started.md)
+doc. Follow this doc to work on only Magenta.
+***

 The Magenta Git repository is located
 at: https://fuchsia.googlesource.com/magenta
@@ -8,7 +14,7 @@ at: https://fuchsia.googlesource.com/magenta
 To clone the repository, assuming you setup the $SRC variable
 in your environment:
 ```shell
-$ git clone https://fuchsia.googlesource.com/magenta $SRC/magenta
+git clone https://fuchsia.googlesource.com/magenta $SRC/magenta
 ```

 For the purpose of this document, we will assume that Magenta is checked
@@ -176,6 +182,14 @@ For QEMU, use the -x option to the run-magenta-* scripts to specify an extra boo

 ## Network Booting

+Network booting is supported via two mechanisms: Gigaboot and Magentaboot.
+Gigaboot is an EFI based bootloader whereas magentaboot is a mechanism that
+allows a minimal magenta system to serve as a bootloader for magenta.
+
+On systems that boot via EFI (such as Acer and NUC), either option is viable.
+On other systems, magentaboot may be the only option for network booting.
+
+### Via Gigaboot
 The [GigaBoot20x6](https://fuchsia.googlesource.com/magenta/+/master/bootloader) bootloader speaks a simple network boot protocol (over IPV6 UDP)
 which does not require any special host configuration or privileged access to use.

@@ -198,6 +212,16 @@ By default bootserver will continue to run and every time it obsveres a netboot
 beacon it will send the kernel (and bootfs if provided) to that device.  If you
 pass the -1 option, bootserver will exit after a successful boot instead.

+
+### Via Magentaboot
+Magentaboot is a mechanism that allows a magenta system to serve as the
+bootloader for magenta itself. Magentaboot speaks the same boot protocol as
+Gigaboot described above.
+
+To use magentaboot, pass the `netsvc.netboot=true` argument to magenta via the
+kernel command line. When magentaboot starts, it will attempt to fetch and boot
+into a magenta system from a bootserver running on the attached host.
+
 ## Network Log Viewing

 The default build of Magenta includes a network log service that multicasts the
@@ -215,7 +239,7 @@ $BUILDDIR/tools/loglistener
 ## Debugging

 For random tips on debugging in the magenta environment see
-[debugging](debugging.md).
+[debugging](debugging/tips.md).

 ## Contribute changes
 * See [contributing.md](contributing.md).
@@ -3,32 +3,66 @@
 The Magenta hypervisor can be used to run a guest operating system. It is a work
 in progress.

-## Run a guest
+## Running a guest

-To run a guest using the hypervisor, you must create a bootfs image
-containing the guest, and use the `guest` app to launch it.
+To run a guest using the hypervisor, you must create a bootfs image containing
+the guest and use the `guest` app to launch it.

-`guest` currently supports Magenta and Linux kernels.
+Note: `guest` only supports the Magenta and Linux kernels.

 On your host device, from the Magenta directory, run:
-
 ```
 scripts/build-magenta-x86-64
-system/uapp/guest/scripts/mklinux.sh  # (optional) Linux only - will download and build the kernel
+
+# Optional: Build Linux, an initial RAM disk, and an EXT2 file-system.
+system/uapp/guest/scripts/mklinux.sh
+system/uapp/guest/scripts/mktoybox.sh -ri
+
+# Optional: Build a GPT disk image for Magenta guests.
+system/uapp/guest/scripts/mkgpt.sh
+
 system/uapp/guest/scripts/mkbootfs.sh
-build-magenta-pc-x86-64/tools/bootserver build-magenta-pc-x86-64/magenta.bin build-magenta-pc-x86-64/bootdata-with-kernel.bin
+build-magenta-pc-x86-64/tools/bootserver \
+    build-magenta-pc-x86-64/magenta.bin \
+    build-magenta-pc-x86-64/bootdata-with-guest.bin
 ```

-After netbooting the target device, for Magenta run:
+### Magenta guest

+After netbooting the target device, to run Magenta:
 ```
-/boot/bin/guest /boot/data/kernel.bin /boot/data/bootdata.bin
+/boot/bin/guest -r /boot/data/bootdata.bin /boot/data/magenta.bin
 ```

-And for Linux run:
-
+To run Magenta using a GPT disk image:
 ```
-/boot/bin/guest /boot/data/bzImage
+/boot/bin/guest \
+    -b /boot/data/magenta.gpt \
+    -r /boot/data/bootdata.bin \
+    /boot/data/magenta.bin
 ```

-You should then see the serial output of the guest operating system.
+### Linux guest
+
+After netbooting the target device, to run Linux using an initial RAM disk:
+```
+/boot/bin/guest -r /boot/data/initrd /boot/data/bzImage
+```
+
+To run Linux using a **read-only** EXT2 root file-system:
+```
+/boot/bin/guest \
+    -b /boot/data/rootfs.ext2 \
+    -c 'root=/dev/vda ro init=/init' \
+    /boot/data/bzImage
+```
+
+To run Linux using a **writable** EXT2 root file-system:
+```
+cp /boot/data/rootfs.ext2 /boot/data/rootfs-rw.ext2
+
+/boot/bin/guest \
+    -b /boot/data/rootfs-rw.ext2 \
+    -c 'root=/dev/vda rw init=/init' \
+    /boot/data/bzImage
+```
@@ -53,19 +53,19 @@ MAGENTA\_DRIVER\_BEGIN macro.

 Example: `driver.usb-audio.disable`

-## kernel.entropy=\<hex>
+## driver.\<name>.log=\<flags>

-Provides entropy to be mixed into the kernel's CPRNG.
+Set the log flags for a driver.  Flags are one or more comma-separated
+values which must be preceded by a "+" (in which case that flag is enabled)
+or a "-" (in which case that flag is disabled).  The textual constants
+"error", "info", "trace", "spew", "debug1", "debug2", "debug3", and "debug4"
+may be used, and they map to the corresponding bits in DDK_LOG_... in `ddk/debug.h`.
+The default log flags for a driver are "error" and "info".

-## kernel.halt_on_panic=\<bool>
-If this option is set (disabled by default), the system will halt on
-a kernel panic instead of rebooting.
+Individual drivers may define their own log flags beyond the eight mentioned
+above.

-## kernel.memory-limit-mb=\<num>
-
-This option tells the kernel to limit system memory to the MB value specified
-by 'num'. Using this effectively allows a user to simulate the system having
-less physical memory than physically present.
+Example: `driver.usb-audio.log=-error,+info,+0x1000`

 ## gfxconsole.early=\<bool>

@@ -82,22 +82,112 @@ needed for debugging it may speed up boot to disable it.
 This option asks the graphics console to use a specific font.  Currently
 only "9x16" (the default) and "18x32" (a double-size font) are supported.

-## virtcon.disable
+## kernel.entropy-mixin=\<hex>

-Do not launch the virtual console service if this option is present.
+Provides entropy to be mixed into the kernel's CPRNG.

-## virtcon.keep-log-visible
+## kernel.entropy-test.len=\<len>

-If this option is present, the virtual console service will keep the
-debug log (vc0) visible instead of switching to the first shell (vc1) at startup.
+When running an entropy collector quality test, collect the provided number of
+bytes. Defaults to the maximum value `ENTROPY_COLLECTOR_TEST_MAXLEN`.

-## virtcon.keymap=\<name>
+The default value for the compile-time constant `ENTROPY_COLLECTOR_TEST_MAXLEN`
+is 128 KiB.

-Specify the keymap for the virtual console.  "qwerty" and "dvorak" are supported.
+## kernel.entropy-test.src=\<source>

-## virtcon.font=\<name>
+When running an entropy collector quality test, use the provided entropy source.
+Currently recognized sources: `hw_rng`, `jitterentropy`.

-Specify the font for the virtual console.  "9x16" and "18x32" are supported.
+## kernel.halt-on-panic=\<bool>
+If this option is set (disabled by default), the system will halt on
+a kernel panic instead of rebooting.
+
+## kernel.jitterentropy.bs=\<num>
+
+Sets the "memory block size" parameter for jitterentropy (the default is 64).
+When jitterentropy is performing memory operations (to increase variation in CPU
+timing), the memory will be accessed in blocks of this size.
+
+## kernel.jitterentropy.bc=\<num>
+
+Sets the "memory block count" parameter for jitterentropy (the default is 512).
+When jitterentropy is performing memory operations (to increase variation in CPU
+timing), this controls how many blocks (of size `kernel.jitterentropy.bs`) are
+accessed.
+
+## kernel.jitterentropy.ml=\<num>
+
+Sets the "memory loops" parameter for jitterentropy (the default is 32). When
+jitterentropy is performing memory operations (to increase variation in CPU
+timing), this controls how many times the memory access routine is repeated.
+This parameter is only used when `kernel.jitterentropy.raw` is true (otherwise,
+jitterentropy chooses the number of loops in a random-ish way).
+
+## kernel.jitterentropy.ll=\<num>
+
+Sets the "LFSR loops" parameter for jitterentropy (the default is 1). When
+jitterentropy is performing CPU-intensive LFSR operations (to increase variation
+in CPU timing), this controls how many times the LFSR routine is repeated.  This
+parameter is only used when `kernel.jitterentropy.raw` is true (otherwise,
+jitterentropy chooses the number of loops in a random-ish way).
+
+## kernel.jitterentropy.raw=\<bool>
+
+When true (the default), the jitterentropy entropy collector will return raw,
+unprocessed samples. When false, the raw samples will be processed by
+jitterentropy, producing output data that looks closer to uniformly random. Note
+that even when set to false, the CPRNG will re-process the samples, so the
+processing inside of jitterentropy is somewhat redundant.
+
+## kernel.memory-limit-mb=\<num>
+
+This option tells the kernel to limit system memory to the MB value specified
+by 'num'. Using this effectively allows a user to simulate the system having
+less physical memory than physically present.
+
+## kernel.oom.enable=\<bool>
+
+This option (true by default) turns on the out-of-memory (OOM) kernel thread,
+which kills processes when the PMM has less than `kernel.oom.redline-mb` free
+memory, sleeping for `kernel.oom.sleep-sec` between checks.
+
+The OOM thread can be manually started/stopped at runtime with the `k oom start`
+and `k oom stop` commands, and `k oom info` will show the current state.
+
+See `k oom` for a list of all OOM kernel commands.
+
+## kernel.oom.redline-mb=\<num>
+
+This option (50 MB by default) specifies the free-memory threshold at which the
+out-of-memory (OOM) thread will trigger a low-memory event and begin killing
+processes.
+
+The `k oom info` command will show the current value of this and other
+parameters.
+
+## kernel.oom.sleep-sec=\<num>
+
+This option (1 second by default) specifies how long the out-of-memory (OOM)
+kernel thread should sleep between checks.
+
+The `k oom info` command will show the current value of this and other
+parameters.
+
+## kernel.smp.maxcpus=\<num>
+
+This option caps the number of CPUs to initialize.  It cannot be greater than
+*SMP\_MAX\_CPUS* for a specific architecture.
+
+## kernel.smp.ht=\<bool>
+
+This option can be used to disable the initialization of hyperthread logical
+CPUs.  Defaults to true.
+
+## kernel.wallclock=\<name>
+
+This option can be used to force the selection of a particular wall clock.  It
+only is used on pc builds.  Options are "tsc", "hpet", and "pit".

 ## ktrace.bufsize

@@ -127,54 +217,19 @@ This option requests that the executable at *path* be launched once the
 system partition is mounted and *init* is launched.  If there is no system
 bootfs or system partition, it will never be launched.

-## kernel.oom.enable=\<bool>
+## magenta.system.writable=\<bool>

-This option (true by default) turns on the out-of-memory (OOM) kernel thread,
-which kills processes when the PMM has less than `kernel.oom.redline_mb` free
-memory, sleeping for `kernel.oom.sleep_sec` between checks.
+This option requests that if a minfs partition with the system type GUID is
+found, it is to be mounted read-write rather than read-only.

-The OOM thread can be manually started/stopped at runtime with the `k oom start`
-and `k oom stop` commands, and `k oom info` will show the current state.
+## netsvc.netboot=\<bool>

-See `k oom` for a list of all OOM kernel commands.
+If true, magenta will attempt to netboot into another instance of magenta upon
+booting.

-## kernel.oom.sleep-sec=\<num>
-
-This option (1 second by default) specifies how long the out-of-memory (OOM)
-kernel thread should sleep between checks.
-
-The `k oom info` command will show the current value of this and other
-parameters.
-
-## kernel.oom.redline-mb=\<num>
-
-This option (50 MB by default) specifies the free-memory threshold at which the
-out-of-memory (OOM) thread will trigger a low-memory event and begin killing
-processes.
-
-The `k oom info` command will show the current value of this and other
-parameters.
-
-## smp.maxcpus=\<num>
-
-This option caps the number of CPUs to initialize.  It cannot be greater than
-*SMP\_MAX\_CPUS* for a specific architecture.
-
-## smp.ht=\<bool>
-
-This option can be used to disable the initialization of hyperthread logical
-CPUs.  Defaults to true.
-
-## startup.keep-log-visible=\<bool>
-
-If this option is set, devmgr will not activate the first interactive
-console. It is useful for scenarios in which user input handling (and
-the ability to switch vcs) is not available. Defaults to false.
-
-## timer.wallclock=\<name>
-
-This option can be used to force the selection of a particular wall clock.  It
-only is used on pc builds.  Options are "tsc", "hpet", and "pit".
+More specifically, magenta will fetch a new magenta system from a bootserver on
+the local link and attempt to kexec into the new image, thereby replacing the
+currently running instance of magenta.

 ## userboot=\<path>

@@ -204,6 +259,23 @@ If this option is set, the `mx_ticks_get` and `mx_ticks_per_second` system
 calls will use `mx_time_get(MX_CLOCK_MONOTONIC)` in nanoseconds rather than
 hardware cycle counters in a hardware-based time unit.  Defaults to false.

+## virtcon.disable
+
+Do not launch the virtual console service if this option is present.
+
+## virtcon.keep-log-visible
+
+If this option is present, the virtual console service will keep the
+debug log (vc0) visible instead of switching to the first shell (vc1) at startup.
+
+## virtcon.keymap=\<name>
+
+Specify the keymap for the virtual console.  "qwerty" and "dvorak" are supported.
+
+## virtcon.font=\<name>
+
+Specify the font for the virtual console.  "9x16" and "18x32" are supported.
+
 # Additional Gigaboot Commandline Options

 ## bootloader.timeout=\<num>
@@ -0,0 +1,48 @@
+# Magenta Kernel Invariants
+
+On x86, Magenta needs to maintain the following invariants for code running
+in ring 0 (kernel mode).
+
+These invariants are documented here because they are not necessarily easy
+to test -- breaking an invariant will not necessarily be caught by
+Magenta's test suite.
+
+* Flags register:
+
+  * The direction flag (DF) should be 0.  This is required by the x86
+    calling conventions.
+
+    If this flag is set to 1, uses of x86 string instructions (e.g. `rep
+    movs` in `memcpy()` or inlined by the compiler) can go wrong and copy
+    in the wrong direction.  It is OK for a function to set this flag to 1
+    temporarily as long as it changes it back to 0 before returning or
+    calling other functions.
+
+  * The alignment check flag (AC) should normally be 0.  On CPUs that
+    support SMAP, this prevents the kernel from accidentally reading or
+    writing userland data.
+
+* The `gs_base` register must point to the current CPU's `x86_percpu`
+  struct whenever running in kernel mode with interrupts enabled.
+  `gs_base` should only be changed to point to something else while
+  interrupts are disabled.  For example, the `swapgs` instruction should
+  only be used when interrupts are disabled.
+
+* The following are usually enforced by the compiler:
+
+  * No use of extended registers (SSE, AVX, x87, etc.) is allowed, because
+    that would clobber userland's register state.
+
+    This is generally enforced by passing `-mno-sse` to the compiler.  That
+    option prevents accidentally using `float` or `double` types in kernel
+    code.  It is also necessary to prevent the compiler from using SSE
+    registers in optimizations (e.g. memory copies).
+
+  * No storing data below `%rsp` on the stack.  Note that userland code can
+    do this: the SysV x86-64 ABI allows functions to store data in the "red
+    zone", which is the 128 bytes below %rsp.  However, kernel code cannot
+    use the red zone because interrupts may clobber this region -- the CPU
+    pushes data onto the stack immediately below %rsp when it invokes an
+    interrupt handler.
+
+    This is generally enforced by passing `-mno-red-zone` to the compiler.
@@ -0,0 +1,26 @@
+# Magenta Makefile Options
+
+The following options can be passed to **make** when building Magenta:
+
+* **BOOTFS_DEBUG_MODULES**: See [debugging tips](debugging/tips.md).
+
+* **DEBUG**: This specifies the debug level.  The default is 2.  Setting
+**DEBUG=1** will disable some debugging code (such as **DEBUG_ASSERT()**),
+while setting **DEBUG=0** will disable more debugging code.
+
+* **ENABLE_ACPI_DEBUG**: See [ACPI debugging](debugging/acpi.md).
+
+* **GLOBAL_DEBUGFLAGS**: See [debugging tips](debugging/tips.md).
+
+* **GOMACC**: Path to the Goma compiler wrapper, **gomacc**, for use within
+Google for distributed builds.  The default is not to use Goma.
+
+* **USE_ASAN**: Set **USE_ASAN=1** to enable using ASan (the address
+sanitizer).
+
+* **USE_CLANG**: Set **USE_CLANG=1** to enable building with Clang.
+Otherwise, the default is to use GCC as the compiler.
+
+* **V**: Set **V=1** to tell the build system to print each command that
+**make** executes.  Otherwise, the build system only prints a short summary
+of each build step.
@@ -69,7 +69,7 @@ A kernel object is implemented as a C++ class that derives from `Dispatcher`
 and that overrides the methods it implements. Thus, for example, the code
 of the Thread object is found in `ThreadDispatcher`. There is plenty of
 code that only cares about kernel objects in the generic sense, in that case
-the name you'll see is `mxtl::RefPtr<Dispatcher>`.
+the name you'll see is `fbl::RefPtr<Dispatcher>`.

 ## Kernel Object security
 In principle, kernel objects do not have an intrinsic notion of security and
@@ -76,7 +76,7 @@ from the kernel, which our current implementation does not require.

    An in-depth tour of the locking primitives in WebKit, complete with
    benchmarks and analysis. Contains a detailed explanation of the "parking
-    lot" concept, which allows very compact representation of user-space
+    lot" concept, which allows very compact representation of userspace
    mutexes.

 ## SYSCALLS
@@ -14,10 +14,37 @@ only move data (not handles).
 Data is written into one end of a socket via *mx_socket_write* and
 read from the opposing end via *mx_socket_read*.

-Upon creation, both ends of the socket are writable and readable. Via
-the **MX_SOCKET_HALF_CLOSE** option to *mx_socket_write*, one end of
-the socket can be closed for reading (and the opposing end for
-writing).
+Upon creation, both ends of the socket are writable and readable. Via the
+**MX_SOCKET_SHUTDOWN_READ** and **MX_SOCKET_SHUTDOWN_WRITE** options to
+*mx_socket_write*, one end of the socket can be closed for reading and/or
+writing.
+
+## SIGNALS
+
+The following signals may be set for a socket object.
+
+**MX_SOCKET_READABLE** data is available to read from the socket
+
+**MX_SOCKET_WRITABLE** data may be written to the socket
+
+**MX_SOCKET_PEER_CLOSED** the other endpoint of this socket has
+been closed.
+
+**MX_SOCKET_READ_DISABLED** reading (beyond already buffered data) is disabled
+permanently for this endpoint either because of passing
+**MX_SOCKET_SHUTDOWN_READ** to this endpoint or passing
+**MX_SOCKET_SHUTDOWN_WRITE** to the peer. Reads on a socket endpoint with this
+signal raised will succeed so long as there is data in the socket that was
+written before reading was disabled.
+
+**MX_SOCKET_WRITE_DISABLED** writing is disabled permanently for this endpoint either
+because of passing **MX_SOCKET_SHUTDOWN_WRITE** to this endpoint or passing
+**MX_SOCKET_SHUTDOWN_READ** to the peer.
+
+**MX_SOCKET_CONTROL_READABLE** data is available to read from the
+socket control plane.
+
+**MX_SOCKET_CONTROL_WRITABLE** data may be written to the socket control plane.

 ## SYSCALLS

@@ -11,7 +11,7 @@ address space.

 ## DESCRIPTION

-VMARs are used by the kernel and user space to represent the allocation of an
+VMARs are used by the kernel and userspace to represent the allocation of an
 address space.

 Every process starts with a single VMAR (the root VMAR) that spans the entire
@@ -0,0 +1,416 @@
+# Magenta program loading and dynamic linking
+
+In Magenta, the kernel is not directly involved in normal program loading.
+(The one necessary exception is bootstrapping the userspace environment at
+system startup; see [`userboot`](userboot.md).)  Instead, the kernel merely
+provides the building blocks
+([VMO](objects/vm_object.md), [process](objects/process.md),
+[VMAR](objects/vm_address_region.md), [thread](objects/thread.md)) from
+which userspace program loading is built.
+
+[TOC]
+
+## ELF and the system ABI
+
+The standard Magenta userspace environment uses
+the [ELF](https://en.wikipedia.org/wiki/Executable_and_Linkable_Format)
+format for machine-code executable files, and provides a dynamic linker and
+C/C++ execution environment that are based on ELF.  Magenta processes can
+use [system calls](syscalls.md) only via the [vDSO](vdso.md), which is
+provided by the kernel in ELF format and uses the C/C++ calling conventions
+common to ELF-based systems for the machine.  Userspace code (given the
+appropriate capabilities) can use the [system call](syscalls.md) building
+blocks directly to create processes and load programs into them without
+using ELF.  But Magenta's standard ABI for machine code uses ELF as
+described here.
+
+## Background: traditional ELF program loading
+
+[ELF](https://en.wikipedia.org/wiki/Executable_and_Linkable_Format) was
+introduced with Unix System V Release 4 and became the common standard
+executable file format for most Unix-like systems, today including Linux and
+all the BSD variants as well as Solaris and many others.  In these systems,
+the kernel integrates program loading with filesystem access via the POSIX
+`execve` API.  There are some variations in how they load ELF programs, but
+most follow a pattern close to this:
+
+ 1. The kernel loads the file by name, and checks whether it's ELF or some
+    other kind of file that the system supports.  This is where `#!` script
+    handling is done, as well as non-ELF format support when present.
+ 2. The kernel maps the ELF image according to its `PT_LOAD` program
+    headers.  For an `ET_EXEC` file, this places the program's segments at
+    fixed addresses in memory specified in `p_vaddr`.  For an `ET_DYN`
+    file, the system chooses the base address where the program's first
+    `PT_LOAD` gets loaded, and following segments are placed according to
+    their `p_vaddr` relative to the first segment's `p_vaddr`.  Usually the
+    base address is chosen randomly (ASLR).
+ 3. If there was a `PT_INTERP` program header, its contents (a range of
+    bytes in the ELF file given by `p_offset` and `p_filesz`) is looked up
+    as a file name to find another ELF file called the *ELF interpreter*.
+    This must be an `ET_DYN` file; the kernel loads it in the same way as it
+    loaded the executable, but always at a location of its own choosing.
+    The interpreter program is usually the ELF dynamic linker with a name
+    like `/lib/ld.so.1` or `/lib/ld-linux.so.2`, but the kernel loads
+    whatever file is named.
+ 4. The kernel sets up the stack and registers for the initial thread, and
+    starts the thread running with the PC at the chosen entry point address.
+
+     * The entry point is the `e_entry` value from the ELF file header,
+       adjusted by base address.  When there was a `PT_INTERP`, the entry
+       point is that of the interpreter rather than the main executable.
+     * There is an assembly-level protocol of register and stack contents
+       that the kernel sets up for the program to receive its argument and
+       environment strings and an *auxiliary vector* of useful values.  When
+       there was a `PT_INTERP`, these include the base address, entry point,
+       and program header table address from the main executable's ELF
+       headers.  This information allows the dynamic linker to find the main
+       executable's ELF dynamic linking metadata in memory and do its work.
+       When dynamic linking startup is complete, the dynamic linker jumps to
+       the main executable's entry point address.
+
+Magenta program loading is inspired by this tradition, but does it somewhat
+differently.  A key reason for the traditional pattern of loading the
+executable before loading the dynamic linker is that the dynamic linker's
+randomly-chosen base address must not intersect with the fixed addresses
+used by an `ET_EXEC` executable file.  Magenta does not support
+fixed-address program loading (ELF `ET_EXEC` files) at all, only
+position-independent executables or *PIE*s, which are ELF `ET_DYN` files.
+
+## The **launchpad** library
+
+The main implementation of program loading resides in
+the [`launchpad` library](../system/ulib/launchpad/).  It has a C API
+in
+[`<launchpad/launchpad.h>`](../system/ulib/launchpad/include/launchpad/launchpad.h) but
+is not formally documented.  The `launchpad` API is not described here.  Its
+treatment of executable files and process startup forms the Magenta system
+ABI for program loading.
+The [lowest userspace layers of the system](userboot.md) implement the same
+protocols.  It's anticipated that in the future most process launching in
+the system will be done by a system service that uses `launchpad` in its
+implementation, rather than by direct use of the library.
+
+Filesystems are not part of the lower layers of Magenta API.  Instead,
+program loading is based on [VMOs](objects/vm_object.md) and on IPC
+protocols used via [channels](objects/channel.md).
+
+A program loading request starts with:
+
+ * a handle to a VMO containing the executable file (`MX_RIGHT_READ` and
+   `MX_RIGHT_EXECUTE` rights are required)
+ * a list of argument strings (to become `argv[]` in a C/C++ program)
+ * a list of environment strings (to become `environ[]` in a C/C++ program)
+ * a list of initial [handles](handles.md), each with
+   a [*handle info entry*](#handle-info-entry)
+
+Three types of file are handled:
+
+{#hashbang}
+* a script file starting with `#!`
+
+  The first line of the file starts with `#!` and must be no more than 127
+  characters long.  The first non-whitespace word following `#!` is the
+  *script interpreter name*.  If there's anything after that, it all
+  together becomes the *script interpreter argument*.
+
+   * The script interpreter name is prepended to the original argument
+     list (to become `argv[0]`).
+   * If there was a script interpreter argument, it's inserted between the
+     interpreter name and the original argument list (to become `argv[1]`,
+     with the original `argv[0]` becoming `argv[2]`).
+   * The program loader looks up the script interpreter name via
+     the [loader service](#the-loader-service) to get a new VMO.
+   * Program loading restarts on that script interpreter VMO with the
+     modified argument list but everything else the same.  The VMO handle
+     for the original executable is just closed; the script interpreter only
+     gets the original `argv[0]` string to work with, not the original VMO.
+     There is a maximum nesting limit (currently 5) constraining how many
+     such restarts will be allowed before program loading just fails.
+
+* an ELF `ET_DYN` file with no `PT_INTERP`
+
+  * The system chooses a random base address for the first `PT_LOAD` segment
+    and then maps in each `PT_LOAD` segment relative to that base address.
+    This is done by creating a [VMAR](objects/vm_address_region.md) covering
+    the whole range from the first page of the first segment to the last
+    page of the last segment.
+  * A VMO is created and mapped at another random address to hold the stack
+    for the initial thread.  If there was a `PT_GNU_STACK` program header
+    with a nonzero `p_memsz`, that determines the size of the stack (rounded
+    up to whole pages).  Otherwise, a reasonable default stack size is used.
+  * The [vDSO](vdso.md) is mapped into the process
+    (another VMO containing an ELF image), also at a random base address.
+  * A new thread is created in the process with [**thread_create**()](syscalls/thread_create.md).
+  * A new [channel](objects/channel.md) is created, called the *bootstrap
+    channel*.  The program loader writes into this channel a message
+    in [the `processargs` protocol](#the-processargs-protocol) format. This
+    *bootstrap message* includes the argument and environment strings and
+    the initial handles from the original request.  That list is augmented
+    with handles for:
+
+     * the new [process](objects/process.md) itself
+     * its root [VMAR](objects/vm_address_region.md)
+     * its initial [thread](objects/thread.md)
+     * the VMAR covering where the executable was loaded
+     * the VMO just created for the stack
+     * optionally, a default [job](objects/job.md) so the new
+       process itself can create more processes
+     * optionally, the vDSO VMO so the new process can let the processes
+       it creates make system calls themselves
+
+    The program loader then closes its end of the channel.
+   * The initial thread is launched with
+     the [**process_start**() system call](syscalls/process_start.md):
+
+      * `entry` sets the new thread's PC to `e_entry` from the executable's
+        ELF header, adjusted by base address.
+      * `stack` sets the new thread's stack pointer to the top of the
+        stack mapping.
+      * `arg1` transfers the handle to the *bootstrap channel* into the
+        first argument register in the C ABI.
+      * `arg2` passes the base address of the vDSO into the second argument
+        register in the C ABI.
+
+     Thus, the program entry point can be written as a C function:
+     ```c
+     noreturn void _start(mx_handle_t bootstrap_channel, uintptr_t vdso_base);
+     ```
+
+{#PT_INTERP}
+* an ELF `ET_DYN` file with a `PT_INTERP`
+
+  In this case, the program loader does not directly use the VMO containing
+  the ELF executable after reading its `PT_INTERP` header.  Instead, it
+  uses the `PT_INTERP` contents as the name of an *ELF interpreter*.  This
+  name is used in a request to the [loader service](#the-loader-service) to
+  get a new VMO containing the ELF interpreter, which is another ELF
+  `ET_DYN` file.  Then that VMO is loaded instead of the main executable's
+  VMO.  Startup is as described above, with these differences:
+
+   * An extra message
+     in [the `processargs` protocol](#the-processargs-protocol) is written
+     to the *bootstrap channel*, preceding the main bootstrap message.  The
+     ELF interpreter is expected to consume this *loader bootstrap message*
+     itself so that it can do its work, but then leave the second bootstrap
+     message in the channel and hand off the bootstrap channel handle to
+     the main program's entry point.  The *loader bootstrap message*
+     includes only the necessary handles added by the program loader, not
+     the full set that go into the main *bootstrap message*, plus these:
+
+      * the original VMO handle for the main ELF executable
+      * a channel handle to the [loader service](#the-loader-service)
+
+     These allow the ELF interpreter to do its own loading of the
+     executable from the VMO and to use the loader service to get
+     additional VMOs for shared libraries to load.  The message also
+     includes the argument and environment strings, which lets the ELF
+     interpreter use `argv[0]` in its log messages, and check for
+     environment variables like `LD_DEBUG`.
+
+   * `PT_GNU_STACK` program headers are ignored.  Instead, the program
+     loader chooses a minimal stack size that is just large enough to
+     contain the *loader bootstrap message* plus some breathing room for
+     the ELF interpreter's startup code to use as call frames.  This
+     "breathing room" size is `PTHREAD_STACK_MIN` in the source, and is
+     tuned such that with a small bootstrap message size the whole stack is
+     only a single page, but a careful dynamic linker implementation has
+     enough space to work in.  The dynamic linker is expected to read the
+     main executable's `PT_GNU_STACK` and switch to a stack of reasonable
+     size for normal use before it jumps to the main executable's entry
+     point.
+
+*** aside
+
+The program loader chooses three randomly-placed chunks of the new
+process's address space before the program (or dynamic linker) gets
+control: the vDSO, the stack, and the dynamic linker itself.  To make it
+possible for the program's own startup to control its address space more
+fully, the program loader currently ensures that these random placements
+are always somewhere in the **upper half of the address space**.  This is
+for the convenience of sanitizer runtimes, which need to reserve some lower
+fraction of the address space.  This behavior will change in the future so
+there is some way to support the sanitizer cases but other processes will
+get fully random placement to maximize the benefits of ASLR.
+
+***
+
+## The **processargs** protocol
+
+[`<magenta/processargs.h>`](../system/public/magenta/processargs.h) defines
+the protocol for the *bootstrap message* sent on the *bootstrap channel* by
+the program loader.  When a process starts up, it has a handle to this
+bootstrap channel and it has access to [system calls](syscalls.md) via
+the [vDSO](vdso.md).  The process has only this one handle and so it can
+see only global system information and its own memory until it gets more
+information and handles via the bootstrap channel.
+
+The `processargs` protocol is a one-way protocol for messages sent on the
+bootstrap channel.  The new process is never expected to write back onto
+the channel.  The program loader usually sends its messages and then closes
+its end of the channel before the new process has even started.  These
+messages must communicate everything a new process will ever need, but the
+code that receives and decodes messages in this format must run in a very
+constrained environment.  Heap allocation is impossible and nontrivial
+library facilities may not be available.
+
+See the [header file](../system/public/magenta/processargs.h) for full
+details of the message format.  It's anticipated that this ad hoc protocol
+will be replaced with a formal IDL-based protocol eventually, but the
+format will be kept simple enough to be decoded by simple hand-written
+code.
+
+A bootstrap message conveys:
+
+ * a list of initial [handles](handles.md)
+ * a 32-bit *handle info entry* corresponding to each handle
+ * a list of name strings that a *handle info entry* can refer to
+ * a list of argument strings (to become `argv[]` in a C/C++ program)
+ * a list of environment strings (to become `environ[]` in a C/C++ program)
+
+{#handle-info-entry}
+The handles serve many purposes, indicated by the *handle info entry* type:
+
+ * essential handles for the process to make [system calls](syscalls.md):
+   [process](objects/process.md), [VMAR](objects/vm_address_region.md),
+   [thread](objects/thread.md), [job](objects/job.md)
+ * [channel](objects/channel.md) to the [loader service](#the-loader-service)
+ * [vDSO](vdso.md) [VMO](objects/vm_object.md)
+ * filesystem-related handles: current directory, file descriptors, name
+   space bindings (these encode an index into the list of name strings)
+ * special handles for system processes:
+   [resource](objects/resource.md), [VMO](objects/vm_object.md)
+ * other types used for higher-layer or private protocol purposes
+
+Most of these are just passed through by the program loader,
+which does not need to know what they're for.
+
+## The **loader service**
+
+In dynamic linking systems, an executable file refers to and uses at
+runtime additional files containing shared libraries and plugins.  The
+dynamic linker is loaded as an [*ELF interpreter*](#PT_INTERP) and is
+responsible for getting access to all these additional files to complete
+dynamic linking before the main program's entry point gets control.
+
+All of Magenta's standard userspace uses dynamic linking, down to the very
+first process loaded by [`userboot`](userboot.md).  Device drivers and
+filesystems are implemented by userspace programs loaded this way.  So
+program loading cannot be defined in terms of higher-layer abstractions
+such as a filesystem paradigm,
+as
+[traditional systems have done](#background_traditional-elf-program-loading).
+Instead, program loading is based only on [VMOs](objects/vm_object.md) and
+a simple [channel](objects/channel.md)-based protocol.
+
+This *loader service* protocol is how a dynamic linker acquires VMOs
+representing the additional files it needs to load as shared libraries.
+
+This is a simple RPC protocol, defined in
+[`<magenta/processargs.h>`](../system/public/magenta/processargs.h).
+As with [the `processargs` protocol](#the-processargs-protocol),
+it's anticipated that this ad hoc protocol will be replaced with a formal
+IDL-based protocol eventually, but the format will be kept simple enough to
+be decoded by simple hand-written code.  The code sending loader service
+requests and receiving their replies during dynamic linker startup may
+not have access to nontrivial library facilities.
+
+An ELF interpreter receives a channel handle for its loader service in its
+`processargs` bootstrap message, identified by the *handle info entry*
+`PA_HND(PA_SVC_LOADER, 0)`.  All requests are synchronous RPCs made
+with [**channel_call**()](syscalls/channel_call.md).  Both requests and
+replies start with the `mx_loader_svc_msg_t` header; some contain
+additional data; some contain a VMO handle.  Request opcodes are:
+
+ * `LOADER_SVC_OP_LOAD_SCRIPT_INTERP`: *string* -> *VMO handle*
+
+   The program loader sends the *script interpreter name* from
+   a [`#!` script](#hashbang) and gets back a VMO to execute in place of
+   the script.
+
+ * `LOADER_SVC_OP_LOAD_OBJECT`: *string* -> *VMO handle*
+
+   The dynamic linker sends the name of an *object* (shared library or
+   plugin) and gets back a VMO handle containing the file.
+
+ * `LOADER_SVC_OP_CONFIG` : *string* -> `reply ignored`
+
+   The dynamic linker sends a string identifying its *load configuration*.
+   This is intended to affect how later `LOADER_SVC_OP_LOAD_OBJECT`
+   requests decide what particular implementation file to supply for a
+   given name.
+
+ * `LOADER_SVC_OP_DEBUG_PRINT`: *string* -> `reply ignored`
+
+   This is a simple ad hoc logging facility intended for debugging the
+   dynamic linker and early program startup issues.  It's convenient
+   because the early startup code is using the loader service but doesn't
+   have access to many other handles or complex facilities yet.  This will
+   be replaced in the future with some simple-to-use logging facility that
+   does not go through the loader service.
+
+ * `LOADER_SVC_OP_LOAD_DEBUG_CONFIG`: *string* -> *VMO handle*
+
+   **This is intended to be a developer-oriented feature and might not
+   ordinarily be available in production runs.**
+
+   The program runtime sends a string naming a *debug configuration* of
+   some kind and gets back a VMO to read configuration data from.  The
+   sanitizer runtimes use this to allow large options text to be stored in
+   a file rather than passed directly in environment strings.
+
+ * `LOADER_SVC_OP_PUBLISH_DATA_SINK`: *string*, *VMO handle* -> `reply ignored`
+
+   **This is intended to be a developer-oriented feature and might not
+   ordinarily be available in production runs.**
+
+   The program runtime sends a string naming a *data sink* and transfers
+   the sole handle to a VMO it wants published there.  The *data sink*
+   string identifies a type of data, and the VMO's object name can
+   specifically identify the data set in this VMO.  The client must
+   transfer the only handle to the VMO (which prevents the VMO being
+   resized without the receiver's knowledge), but it might still have the
+   VMO mapped in and continue to write data to it.  Code instrumentation
+   runtimes use this to deliver large binary trace results.
+
+## Magenta's standard ELF dynamic linker
+
+The ELF conventions described above and
+the [`processargs`](#the-processargs-protocol)
+and [loader service](#the-loader-service) protocols are the permanent
+system ABI for program loading.  Programs can use any implementation of a
+machine code executable that meets the basic ELF format conventions.  The
+implementation can use the [vDSO](vdso.md) [system call](syscalls.md)
+ABI, the `processargs` data, and the loader service facilities as it sees
+fit.  The exact details of what handles and data they will receive via
+these protocols depend on the higher-layer program environment.  Magenta's
+system processes use an ELF interpreter that implements basic ELF dynamic
+linking, and a simple implementation of the loader service.
+
+Magenta's standard C library and dynamic linker have
+a [unified implementation](../third_party/ulib/musl/) originally derived
+from [`musl`](http://www.musl-libc.org/).  It's identified by the
+`PT_INTERP` string `ld.so.1`.  It uses the `DT_NEEDED` strings naming
+shared libraries as [loader service](#the-loader-service) *object* names.
+
+The simple loader service maps requests into filesystem access:
+ * *script interpreter* and *debug configuration* names must start with `/`
+   and are used as absolute file names.
+ * *data sink* names become subdirectories in `/tmp`, and each VMO
+   published becomes a file in that subdirectory with the VMO's object name.
+ * *object* names are searched for as files in system `lib/` directories.
+ * *load configuration* strings are taken as a subdirectory name,
+   optionally preceded by a `!` character.  Subdirectories by that name in
+   system `lib/` directories are searched before `lib/` itself.
+   If there was a `!` prefix, *only* those subdirectories are searched.
+   For example, sanitizer runtimes use `asan` because that instrumentation
+   is compatible with uninstrumented library code, but `!dfsan` because
+   that instrumentation requires that all code in the process be
+   instrumented.
+
+A version of the standard runtime instrumented with
+LLVM [AddressSanitizer](https://clang.llvm.org/docs/AddressSanitizer.html)
+is identified by the `PT_INTERP` string `asan/ld.so.1`.  This version sends
+the *load configuration* string `asan` before loading shared libraries.
+When [SanitizerCoverage](https://clang.llvm.org/docs/SanitizerCoverage.html)
+is enabled, it publishes a VMO to the *data sink* name `sancov` and uses a
+VMO name including the process KOID.
@@ -0,0 +1,144 @@
+# Static Analysis in Magenta
+
+This document describes:
+
+* How to perform static analysis with the Clang Static Analyzer in Magenta;
+* How to enable MagentaHandleChecker;
+* How to add/modify annotate attributes to syscalls/functions and use annotate attributes to suppress false positives.
+
+## Steps to run Clang Static Analyzer
+
+Assuming you have already obtained a local copy of the Fuchsia workspace according to the instructions in [get_source.md](https://fuchsia.googlesource.com/docs/+/master/getting_source.md), that the Fuchsia source tree is located at `$LOCAL_DIR/fuchsia`, and that the current working directory is `$LOCAL_DIR/fuchsia/magenta`, the Clang Static Analyzer can be run on Magenta with the following commands:
+
+```sh
+./scripts/download-toolchain
+./scripts/analyze-magenta
+```
+
+The Clang Static Analyzer will be run on the Magenta code base with the default checkers. After the analysis finishes, you will see output on stdout similar to the following:
+
+```
+scan-build: Run 'scan-view $LOCAL_DIR/fuchsia/magenta/AnalysisResult/scan-build-2017-08-08-11-26-25-914570-SKSE39' to examine bug reports.
+```
+
+Just type the command starting with `scan-view` in a terminal and it will open your web browser and show the analysis reports.
+
+## Steps to enable MagentaHandleChecker
+
+At the time this document is written, all Magenta related checkers are still under review by upstream LLVM community:
+
+ * MutexInInterruptContext [D27854](https://reviews.llvm.org/D27854)
+ * SpinLockChecker [D26340](https://reviews.llvm.org/D26340)
+ * MutexChecker [D26342](https://reviews.llvm.org/D26342)
+ * MagentaHandleChecker [D35968](https://reviews.llvm.org/D35968) [D36022](https://reviews.llvm.org/D36022) [D36023](https://reviews.llvm.org/D36023) [D36024](https://reviews.llvm.org/D36024) [D36251](https://reviews.llvm.org/D36251) [D36475](https://reviews.llvm.org/D36475)
+
+They are not enabled by default when you execute the 'analyze-magenta' script. We will update the 'analyze-magenta' script to enable them by default once they have landed.
+
+In the meantime, if you would like to try MagentaHandleChecker now, you can download the source code of LLVM with Clang, apply the patches from the diffs above, and follow the instructions in [toolchain.md](https://fuchsia.googlesource.com/docs/+/master/toolchain.md) to build your own toolchain. Assuming you have built your own toolchain, that it is located at `$LOCAL_TOOLCHAIN_PREFIX`, and that `$LOCAL_TOOLCHAIN_PREFIX/bin/clang` is the path to the `clang` command, the Clang Static Analyzer can be run with MagentaHandleChecker and the other default checkers enabled with the following command:
+
+```
+./scripts/analyze-magenta -p $LOCAL_TOOLCHAIN_PREFIX -m all
+```
+
+If you want to enable MagentaHandleChecker and disable the other default checkers, please run the following command:
+
+```
+./scripts/analyze-magenta -p $LOCAL_TOOLCHAIN_PREFIX -m magenta
+```
+
+The 'analyze-magenta' script has additional options, such as changing the output directory and changing build targets; please refer to the help information printed by `./scripts/analyze-magenta -h`.
+
+## Steps to add/modify annotate attributes to syscalls/functions
+
+In the Magenta code base, raw annotations like `__attribute__((annotate("string")))` should never be used; all Magenta-related annotations should be wrapped by macros. In this section, we will discuss how to add or modify annotations in the Magenta code base.
+
+### Annotations in syscall declaration
+
+As header files of Magenta syscalls are generated from syscalls.sysgen, in order to add/modify annotations of syscalls, the syscalls.sysgen should be modified directly.
+Let’s use the `mx_channel_create` syscall as an example. This syscall allocates two handles when it executes successfully. Without annotations, its declaration in sysgen looks like:
+
+```c
+syscall channel_create
+    (options: uint32_t)
+    returns (mx_status_t, out0: mx_handle_t, out1: mx_handle_t);
+```
+
+As argument `out0` and `out1` will be allocated handles, we should add `handle_acquire` annotation to these arguments:
+
+```c
+syscall channel_create
+    (options: uint32_t)
+    returns (mx_status_t, out0: mx_handle_t handle_acquire,
+             out1: mx_handle_t handle_acquire);
+```
+
+This syscall declaration will be processed by sysgen and converted to:
+
+```c
+extern mx_status_t mx_channel_create(
+uint32_t options,
+    MX_SYSCALL_PARAM_ATTR(handle_acquire) mx_handle_t* out0,
+    MX_SYSCALL_PARAM_ATTR(handle_acquire) mx_handle_t* out1);
+```
+
+The declaration of macro can be found in system/public/magenta/syscalls.h, which is:
+
+```c
+#if defined(__clang__)
+#define MX_SYSCALL_PARAM_ATTR(x)   __attribute__((annotate("mx_" #x)))
+#else
+#define MX_SYSCALL_PARAM_ATTR(x)   // no-op
+#endif
+```
+
+According to the definition of `MX_SYSCALL_PARAM_ATTR`, the `mx_channel_create` will be parsed into:
+
+```c
+extern mx_status_t mx_channel_create(uint32_t options,
+__attribute__((annotate("mx_handle_acquire"))) mx_handle_t* out0,
+__attribute__((annotate("mx_handle_acquire"))) mx_handle_t* out1) __attribute__((__leaf__));
+```
+
+The reason that we use macros to wrap these annotations is that annotate attribute is not supported by compilers other than Clang, e.g. GCC. Furthermore, it would be convenient if we decide to use annotation solutions other than the annotate attributes in the future. Otherwise we need to change each annotation one by one.
+
+### Annotations in other functions
+
+For functions other than syscalls, if `system/public/magenta/syscalls.h` is in current include path, you can use `MX_SYSCALL_PARAM_ATTR` macro to wrap your annotations. If not, you should use macros similar to this one. The reason that functions other than syscalls may require annotations is that some functions contain known false positives and we can use annotation to suppress the warnings of these false positives. For example, in MagentaHandleChecker’s test file we have:
+
+```c
+#if defined(__clang__)
+#define MX_ANALYZER_SUPPRESS   __attribute__((annotate("mx_suppress_warning")))
+#else
+#define MX_ANALYZER_SUPPRESS   // no-op
+#endif
+void checkSuppressWarning() MX_ANALYZER_SUPPRESS {
+  mx_handle_t sa, sb;
+  if (mx_channel_create(0, &sa, &sb) < 0) {
+    return;
+  }
+  mx_handle_close(sa); // Should not report any bugs here
+}
+```
+
+The analyzer will suppress the warnings for any bugs it discovers in the `checkSuppressWarning` function. If you don’t want to define your own macro for this purpose, and `syscalls.h` is in the include path, you can use `MX_SYSCALL_PARAM_ATTR(suppress_warning)` instead; it will suppress the warnings for all bugs discovered in functions with this annotation.
+
+Similar to the `mx_suppress_warning` annotation, we have the `mx_create_sink` annotation, which is currently used to suppress warnings on assertion failures. This annotation is unlikely to be used for other purposes; however, if you would like to know how it works, please refer to the discussions in CL[46428](https://fuchsia-review.googlesource.com/c/46428).
+
+To manually annotate non-syscall functions, the "MX_SYSCALL_PARAM_ATTR" macro can be applied to function arguments, emulating the effect of the sysgen attributes. For example, here, we annotate a regular function which might be used to call the "mx_channel_create" function without passing the "options" argument:
+
+```c
+mx_status_t create_channel(
+  MX_SYSCALL_PARAM_ATTR(handle_acquire) mx_handle_t* out0,
+  MX_SYSCALL_PARAM_ATTR(handle_acquire) mx_handle_t* out1);
+```
+As another example, suppose we have a function `takeover_handle` that takes over the lifecycle of a handle if it executes successfully and does nothing if it fails; we can declare this function in a header file like this:
+
+```c
+mx_status_t takeover_handle(
+  MX_SYSCALL_PARAM_ATTR(handle_escape) mx_handle_t in)
+  MX_SYSCALL_PARAM_ATTR(may_fail);
+```
+
+The `mx_may_fail` annotation here will cause state bifurcation when MagentaHandleChecker is evaluating calls to this function. So both succeeded and failed states will be covered.
+
+If the `MX_SYSCALL_PARAM_ATTR` macro is not available in the file that declares the function, you can define your own macros, as long as they do not expand into an annotate attribute when the code is not compiled by Clang.
@@ -0,0 +1,372 @@
+# Symbolizer markup format #
+
+This document defines a text format for log messages that can be
+processed by a _symbolizing filter_.  The basic idea is that logging
+code emits text that contains raw address values and so forth, without
+the logging code doing any real work to convert those values to
+human-readable form.  Instead, logging text uses the markup format
+defined here to identify pieces of information that should be converted
+to human-readable form after the fact.  As with other markup formats,
+the expectation is that most of the text will be displayed as is, while
+the markup elements will be replaced with expanded text, or converted
+into active UI elements, that present more details in symbolic form.
+
+This means there is no need for symbol tables, DWARF debugging sections,
+or similar information to be directly accessible at runtime.  There is
+also no need at runtime for any logic intended to compute human-readable
+presentation of information, such as C++ symbol demangling.  Instead,
+logging must include markup elements that give the contextual
+information necessary to make sense of the raw data, such as memory
+layout details.
+
+This format identifies markup elements with a syntax that is both simple
+and distinctive.  It's simple enough to be matched and parsed with
+straightforward code.  It's distinctive enough that character sequences
+that look like the start or end of a markup element should rarely if
+ever appear incidentally in logging text.  It's specifically intended
+not to require sanitizing plain text, such as the HTML/XML requirement
+to replace `<` with `&lt;` and the like.
+
+## Scope and assumptions ##
+
+This specification defines a format standard for Magenta and Fuchsia.
+But there is nothing specific to Magenta or Fuchsia about the markup
+format.  A symbolizing filter implementation will be independent both of
+the _target_ operating system and machine architecture where the logs
+are generated and of the _host_ operating system and machine
+architecture where the filter runs.
+
+This format assumes that the symbolizing filter processes intact whole
+lines.  If long lines might be split during some stage of a logging
+pipeline, they must be reassembled to restore the original line breaks
+before feeding lines into the symbolizing filter.  Most markup elements
+must appear entirely on a single line (often with other text before
+and/or after the markup element).  There are some markup elements that
+are specified to span lines, with line breaks in the middle of the
+element.  Even in those cases, the filter is not expected to handle line
+breaks in arbitrary places inside a markup element, but only inside
+certain fields.
+
+This format assumes that the symbolizing filter processes a coherent
+stream of log lines from a single process address space context.  If a
+logging stream interleaves log lines from more than one process, these
+must be collated into separate per-process log streams and each stream
+processed by a separate instance of the symbolizing filter.  Because the
+kernel and user processes use disjoint address regions in most operating
+systems (including Magenta), a single user process address space plus
+the kernel address space can be treated as a single address space for
+symbolization purposes if desired.
+
+## Dependence on Build IDs ##
+
+The symbolizer markup scheme relies on contextual information about
+runtime memory address layout to make it possible to convert markup
+elements into useful symbolic form.  This relies on having an
+unmistakable identification of which binary was loaded at each address.
+
+An ELF Build ID is the payload of an ELF note with name `"GNU"` and type
+`NT_GNU_BUILD_ID`, a unique byte sequence that identifies a particular
+binary (executable, shared library, loadable module, or driver module).
+The linker generates this automatically based on a hash that includes
+the complete symbol table and debugging information, even if this is
+later stripped from the binary.
+
+This specification uses the ELF Build ID as the sole means of
+identifying binaries.  Each binary relevant to the log must have been
+linked with a unique Build ID.  The symbolizing filter must have some
+means of mapping a Build ID back to the original ELF binary (either the
+whole unstripped binary, or a stripped binary paired with a separate
+debug file).
+
+## Colorization ##
+
+The markup format supports a restricted subset of ANSI X3.64 SGR (Select
+Graphic Rendition) control sequences.  These are unlike other markup
+elements:
+ * They specify presentation details (**bold** or colors) rather than
+   semantic information.  The association of semantic meaning with color
+   (e.g. red for errors) is chosen by the code doing the logging, rather
+   than by the UI presentation of the symbolizing filter.  This is a
+   concession to existing code (e.g. LLVM sanitizer runtimes) that use
+   specific colors and would require substantial changes to generate
+   semantic markup instead.
+ * A single control sequence changes "the state", rather than being a
+   hierarchical structure that surrounds affected text.
+
+The filter processes ANSI SGR control sequences only within a single
+line.  If a control sequence to enter a **bold** or color state is
+encountered, it's expected that the control sequence to reset to default
+state will be encountered before the end of that line.  If a "dangling"
+state is left at the end of a line, the filter may reset to default
+state for the next line.
+
+An SGR control sequence is not interpreted inside any other markup element.
+However, other markup elements may appear between SGR control sequences and
+the color/**bold** state is expected to apply to the symbolic output that
+replaces the markup element in the filter's output.
+
+The accepted SGR control sequences all have the form `"\033[%um"`
+(expressed here using C string syntax), where `%u` is one of these:
+
+| Code | Effect | Notes |
+|:----:|:------:|-------|
+| `0`  | Reset to default formatting. | |
+| `1`  | Use **bold text**  | Combines with color states, doesn't reset them.|
+| `30` | Black foreground   | |
+| `31` | Red foreground     | |
+| `32` | Green foreground   | |
+| `33` | Yellow foreground  | |
+| `34` | Blue foreground    | |
+| `35` | Magenta foreground | |
+| `36` | Cyan foreground    | |
+| `37` | White foreground   | |
+
+## Common markup element syntax ##
+
+All the markup elements share a common syntactic structure to facilitate
+simple matching and parsing code.  Each element has the form:
+
+```
+{{{tag:fields}}}
+```
+
+`tag` identifies one of the element types described below, and is always
+a short alphabetic string that must be in lower case.  The rest of the
+element consists of one or more fields.  Fields are separated by `:` and
+cannot contain any `:` or `}` characters.  How many fields must be or
+may be present and what they contain is specified for each element type.
+
+No markup elements or ANSI SGR control sequences are interpreted inside the
+contents of a field.
+
+In the descriptions of each element type, `printf`-style placeholders
+indicate field contents:
+
+* `%s`
+
+  A string of printable characters, not including `:` or `}`.
+
+* `%p`
+
+  An address value represented by `0x` followed by an even number of
+  hexadecimal digits (using either lower-case or upper-case for
+  `A`..`F`).  If the digits are all `0` then the `0x` prefix may be
+  omitted.  No more than 16 hexadecimal digits are expected to appear in
+  a single value (64 bits).
+
+* `%u`
+
+  A nonnegative decimal integer.
+
+* `%x`
+
+  A sequence of an even number of hexadecimal digits (using either
+  lower-case or upper-case for `A`..`F`), with no `0x` prefix.
+  This represents an arbitrary sequence of bytes, such as an ELF Build ID.
+
+## Presentation elements ##
+
+These are elements that convey a specific program entity to be displayed
+in human-readable symbolic form.
+
+* `{{{symbol:%s}}}`
+
+  Here `%s` is the linkage name for a symbol or type.  It may require
+  demangling according to language ABI rules.  Even for unmangled names,
+  it's recommended that this markup element be used to identify a symbol
+  name so that it can be presented distinctively.
+
+  Examples:
+  ```
+  {{{symbol:_ZN7Mangled4NameEv}}}
+  {{{symbol:foobar}}}
+  ```
+
+* `{{{pc:%p}}}`
+
+  Here `%p` is the memory address of a code location.
+  It might be presented as a function name and source location.
+
+  Examples:
+  ```
+  {{{pc:0x12345678}}}
+  {{{pc:0xffffffff9abcdef0}}}
+  ```
+
+* `{{{data:%p}}}`
+
+  Here `%p` is the memory address of a data location.
+  It might be presented as the name of a global variable at that location.
+
+  Examples:
+  ```
+  {{{data:0x12345678}}}
+  {{{data:0xffffffff9abcdef0}}}
+  ```
+
+* `{{{bt:%u:%p}}}`
+
+  This represents one frame in a backtrace.  It usually appears on a
+  line by itself (surrounded only by whitespace), in a sequence of such
+  lines with ascending frame numbers.  So the human-readable output
+  might be formatted assuming that, such that it looks good for a
+  sequence of `bt` elements each alone on its line with uniform
+  indentation of each line.  But it can appear anywhere, so the filter
+  should not remove any non-whitespace text surrounding the element.
+
+  Here `%u` is the frame number, which starts at zero for the location
+  of the fault being identified, increments to one for the caller of
+  frame zero's call frame, to two for the caller of frame one, etc.
+  `%p` is the memory address of a code location.
+
+  In frames after frame zero, this code location identifies a call site.
+  Some emitters may subtract one byte or one instruction length from the
+  actual return address for the call site, with the intent that the
+  address logged can be translated directly to a source location for the
+  call site and not for the apparent return site thereafter (which can
+  be confusing).  It's recommended that emitters _not_ do this, so that
+  each frame's code location is the exact return address given to its
+  callee and e.g. could be highlighted in instruction-level disassembly.
+  The symbolizing filter can do the adjustment to the address it
+  translates into a source location.  Assuming that a call instruction
+  is longer than one byte on all supported machines, applying the
+  "subtract one byte" adjustment a second time still results in an
+  address somewhere in the call instruction, so a little sloppiness here
+  does no harm.
+
+  Examples:
+  ```
+  {{{bt:0:0x12345678}}}
+  {{{bt:1:0xffffffff9abcdef0}}}
+  ```
+
+* `{{{hexdict:...}}}`
+
+  This element can span multiple lines.  Here `...` is a sequence of
+  key-value pairs where a single `:` separates each key from its value,
+  and arbitrary whitespace separates the pairs.  The value (right-hand
+  side) of each pair either is one or more `0` digits, or is `0x`
+  followed by hexadecimal digits.  Each value might be a memory address
+  or might be some other integer (including an integer that looks like a
+  likely memory address but actually has an unrelated purpose).  When
+  the contextual information about the memory layout suggests that a
+  given value could be a code location or a global variable data
+  address, it might be presented as a source location or variable name
+  or with active UI that makes such interpretation optionally visible.
+
+  The intended use is for things like register dumps, where the emitter
+  doesn't know which values might have a symbolic interpretation but a
+  presentation that makes plausible symbolic interpretations available
+  might be very useful to someone reading the log.  At the same time,
+  a flat text presentation should usually avoid interfering too much
+  with the original contents and formatting of the dump.  For example,
+  it might use footnotes with source locations for values that appear
+  to be code locations.  An active UI presentation might show the dump
+  text as is, but highlight values with symbolic information available
+  and pop up a presentation of symbolic details when a value is selected.
+
+  Example:
+  ```
+  {{{hexdict:
+    CS:                   0 RIP:     0x6ee17076fb80 EFL:            0x10246 CR2:                  0
+    RAX:      0xc53d0acbcf0 RBX:     0x1e659ea7e0d0 RCX:                  0 RDX:     0x6ee1708300cc
+    RSI:                  0 RDI:     0x6ee170830040 RBP:     0x3b13734898e0 RSP:     0x3b13734898d8
+     R8:     0x3b1373489860  R9:         0x2776ff4f R10:     0x2749d3e9a940 R11:              0x246
+    R12:     0x1e659ea7e0f0 R13: 0xd7231230fd6ff2e7 R14:     0x1e659ea7e108 R15:      0xc53d0acbcf0
+  }}}
+  ```
+
+* `{{{dumpfile:%s:%s}}}`
+
+  Here the first `%s` is an identifier for a type of dump and the
+  second `%s` is an identifier for a particular dump that's just been
+  published.  The types of dumps, the exact meaning of "published",
+  and the nature of the identifier are outside the scope of the markup
+  format per se.  In general it might correspond to writing a file by
+  that name or something similar.
+
+  This is technically a presentation element, but it may also serve to
+  trigger additional post-processing work beyond symbolizing the markup.
+  It indicates that a dump file of some sort has been published.  Some
+  logic attached to the symbolizing filter may understand certain types
+  of dump file and trigger additional post-processing of the dump file
+  upon encountering this element (e.g. generating visualizations,
+  symbolization).  The expectation is that the information collected
+  from contextual elements (described below) in the logging stream may
+  be necessary to decode the content of the dump.  So if the symbolizing
+  filter triggers other processing, it may need to feed some distilled
+  form of the contextual information to those processes.
+
+  On Magenta and Fuchsia in particular, "publish" means to call the
+  `__sanitizer_publish_data` function from `<magenta/sanitizer.h>`
+  with the "type" identifier as the "sink name" string.  The "dump
+  identifier" is the name attached to the Magenta VMO whose handle
+  was passed in the call to `__sanitizer_publish_data`.
+  **TODO(mcgrathr): Link to docs about `__sanitizer_publish_data` and
+  getting data dumps off the device.**
+
+  An example of a type identifier is `sancov`, for dumps from LLVM
+  [SanitizerCoverage](https://clang.llvm.org/docs/SanitizerCoverage.html).
+
+  Example:
+  ```
+  {{{dumpfile:sancov:sancov.8675}}}
+  ```
+
+## Contextual elements ##
+
+These are elements that supply information necessary to convert
+presentation elements to symbolic form.  Unlike presentation elements,
+they are not directly related to the surrounding text.  Contextual
+elements should appear alone on lines with no other non-whitespace
+text, so that the symbolizing filter might elide the whole line from
+its output without hiding any other log text.
+
+The contextual elements themselves do not necessarily need to be
+presented in human-readable output.  However, the information they
+impart may be essential to understanding the logging text even after
+symbolization.  So it's recommended that this information be preserved
+in some form when the original raw log with markup may no longer be
+readily accessible for whatever reason.
+
+Contextual elements should appear in the logging stream before they are
+needed.  That is, if some piece of context may affect how the
+symbolizing filter would interpret or present a later presentation
+element, the necessary contextual elements should have appeared
+somewhere earlier in the logging stream.  It should always be possible
+for the symbolizing filter to be implemented as a single pass over the
+raw logging stream, accumulating context and massaging text as it goes.
+
+* `{{{module:%x:%s:...}}}`
+
+  Here `%x` encodes an ELF Build ID (or equivalent unique identifier).
+  The `%s` is a human-readable identifier for the module, such as an ELF
+  `DT_SONAME` string or a file name; but it might be empty.  It's only
+  for casual information.  The Build ID string is the sole way to
+  identify the binary from which this module was loaded.  A "module" is
+  a single linked binary, such as a loaded ELF file.  Usually each
+  module occupies a contiguous range of memory (always does on Magenta).
+
+  The `...` is a sequence of fields (separated by `:`) that each
+  describe a range of memory, called a _segment_.  The field for each
+  segment has the form `%p,%p,%s` which can be read as: address, size,
+  flags.  The first `%p` is the starting address of the segment and
+  the second `%p` is its size in bytes.  The starting address will
+  usually have been rounded down to the active page size, and the size
+  rounded up.  The `%s` is one or more of the letters 'r', 'w', and
+  'x' (in that order and in either upper or lower case) to indicate
+  this segment of memory is readable, writable, and/or executable.
+  The symbolizing filter can use this information to guess whether an
+  address is a likely code address or a likely data address in the
+  given module.
+
+  There can be any number of segments (within reason), but there must
+  be at least one.  They must be in ascending order of address and
+  must not overlap.  For an ELF module, the segments should correspond
+  exactly to the `PT_LOAD` segments in the ELF file's program headers
+  (except for address and size rounding).
+
+  Example:
+  ```
+  {{{module:83238ab56ba10497:libc.so:0x7acba69d5000,0x5a000,r:0x7acba6a2f000,0x7e000,rx:0x7acba6aad000,0x8000,rw}}}
+  ```
@@ -101,7 +101,7 @@

 ## Timers
 + [timer_create](syscalls/timer_create.md) - create a timer object
-+ [timer_start](syscalls/timer_start.md) - start a timer
++ [timer_set](syscalls/timer_set.md) - start a timer
 + [timer_cancel](syscalls/timer_cancel.md) - cancel a timer

 ## Global system information
@@ -33,6 +33,27 @@ not a valid userspace pointer.
 There are no other error conditions.  If its arguments are valid,
 **mx_cprng_draw**() will succeed.

+## EXAMPLES
+
+```
+// Draw |len| bytes of cryptographically secure random data into |buf|.
+// It is not recommended to call this with large lengths.  If you need many
+// bytes, you likely want a usermode CPRNG seeded by this function.
+mx_status_t draw(char* buf, size_t len) {
+    // This loop is necessary to deal with short reads from the kernel.
+    while (len > 0) {
+        size_t actual;
+        mx_status_t status = mx_cprng_draw(buf, min(len, MX_CPRNG_DRAW_MAX_LEN), &actual);
+        if (status != MX_OK) {
+            return status;
+        }
+        buf += actual;
+        len -= actual;
+    }
+    return MX_OK;
+}
+```
+
 ## BUGS

 This syscall should be rate-limited.
@@ -8,33 +8,39 @@ guest_set_trap - sets a trap within a guest

 ```
 #include <magenta/syscalls.h>
-#include <magenta/syscalls/hypervisor.h>
+#include <magenta/syscalls/port.h>

 mx_status_t mx_guest_set_trap(mx_handle_t guest, uint32_t kind, mx_vaddr_t addr,
-                              size_t len, mx_handle_t fifo);
+                              size_t len, mx_handle_t port, uint64_t key);
 ```

 ## DESCRIPTION

 **guest_set_trap**() sets a trap within a guest, which generates a packet when
-when there is an access by a VCPU within the address range defined by *addr* and
+there is an access by a VCPU within the address range defined by *addr* and
 *len*, within the address space defined by *kind*.

-If *fifo* is specified, a *mx_guest_packet_t* packet for the trap will be
-delivered through the FIFO, otherwise if *MX_HANDLE_INVALID* is given, the
-packet will be delivered through **vcpu_resume**(). This provides control over
-whether the packet is delivered asynchronously, or synchronously.
+If *port* is specified, a packet with a *key* for the trap will be delivered
+through the port each time it is triggered, otherwise if *MX_HANDLE_INVALID* is
+given, the packet will be delivered through **vcpu_resume**() and a key of 0
+will be set. This provides control over whether the packet is delivered
+asynchronously or synchronously, and provides the ability to distinguish packets
+multiplexed onto the same port.

-If *fifo* is full, execution of the VCPU that caused the trap will be paused.
-When the FIFO is no longer full, execution of the VCPU will resume.
+When *port* is specified, a fixed number of packets are pre-allocated per trap.
+If all the packets are exhausted, execution of the VCPU that caused the trap
+will be paused. When at least one packet is dequeued, execution of the VCPU will
+resume. To dequeue a packet from *port*, use *port_wait*(). Multiple threads may
+use *port_wait*() to dequeue packets, enabling the use of a thread pool to
+handle traps.

-When *fifo* is created, its *elem_size* must be equivalent to
-*sizeof(mx_guest_packet_t)*.
-
-*kind* may be either *MX_GUEST_TRAP_MEMORY* or *MX_GUEST_TRAP_IO*. If
-*MX_GUEST_TRAP_MEMORY* is specified, then *addr* and *len* must both be
+*kind* may be either *MX_GUEST_TRAP_MEM* or *MX_GUEST_TRAP_IO*. If
+*MX_GUEST_TRAP_MEM* is specified, then *addr* and *len* must both be
 page-aligned.

+To identify what *kind* of trap generated a packet, use *MX_PKT_TYPE_GUEST_MEM*
+and *MX_PKT_TYPE_GUEST_IO*.
+
 ## RETURN VALUE

 **guest_set_trap**() returns MX_OK on success. On failure, an error value is
@@ -42,26 +48,27 @@ returned.

 ## ERRORS

-**MX_ERR_ACCESS_DENIED** *guest* or *fifo* do not have the *MX_RIGHT_WRITE*
+**MX_ERR_ACCESS_DENIED** *guest* or *port* do not have the *MX_RIGHT_WRITE*
 right.

-**MX_ERR_BAD_HANDLE** *guest* or *fifo* are invalid handles.
+**MX_ERR_BAD_HANDLE** *guest* or *port* are invalid handles.

-**MX_ERR_INVALID_ARGS** *kind* is not a valid address space, or *addr* or
-*len* does not meet the requirements of *kind*.
+**MX_ERR_INVALID_ARGS** *kind* is not a valid address space, *addr* or *len*
+do not meet the requirements of *kind*, or *len* is 0.

 **MX_ERR_NO_MEMORY** Temporary failure due to lack of memory.

 **MX_ERR_OUT_OF_RANGE** The region specified by *addr* and *len* is outside of
 of the valid bounds of the address space *kind*.

-**MX_ERR_WRONG_TYPE** *guest* is not a handle to a guest, or *fifo* is not a
-handle to a FIFO.
+**MX_ERR_WRONG_TYPE** *guest* is not a handle to a guest, or *port* is not a
+handle to a port.

 ## SEE ALSO

-[fifo_create](fifo_create.md),
 [guest_create](guest_create.md),
+[port_create](port_create.md),
+[port_wait](port_wait.md),
 [vcpu_create](vcpu_create.md),
 [vcpu_resume](vcpu_resume.md),
 [vcpu_interrupt](vcpu_interrupt.md),
@@ -65,6 +65,8 @@ Where *condition* is one of
  a new socket.
 + **MX_POL_NEW_FIFO** a process under this job is attempting to create
  a new fifo.
++ **MX_POL_NEW_TIMER** a process under this job is attempting to create
+  a new timer.
 + **MX_POL_NEW_ANY** is a special *condition* that stands for all of
  the above **MX_NEW** condtions such as **MX_POL_NEW_VMO**,
  **MX_POL_NEW_CHANNEL**, **MX_POL_NEW_EVENT**, **MX_POL_NEW_EVPAIR**,
@@ -15,9 +15,7 @@ mx_status_t mx_nanosleep(mx_time_t deadline);
 ## DESCRIPTION

 **nanosleep**() suspends the calling thread execution until *deadline* passes on
-**MX_CLOCK_MONOTONIC**. The special value **MX_TIME_INFINITE** suspends the
-calling thread execution indefinitely. The value **0** immediately yields the
-thread.
+**MX_CLOCK_MONOTONIC**. The value **0** immediately yields the thread.

 To sleep for a duration, use [**mx_deadline_after**](deadline_after.md) and the
 **MX_\<time-unit\>** helpers:
@@ -134,13 +134,34 @@ provided Resource handle.

 ```
 typedef struct mx_info_thread {
+    // One of MX_THREAD_STATE_* values.
+    uint32_t state;
+
    // If nonzero, the thread has gotten an exception and is waiting for
    // the exception to be handled by the specified port.
    // The value is one of MX_EXCEPTION_PORT_TYPE_*.
+    // Note: If the thread is waiting for an exception response then |state|
+    // will have the value MX_THREAD_STATE_BLOCKED.
    uint32_t wait_exception_port_type;
 } mx_info_thread_t;
 ```

+The values in this struct are mainly for informational and debugging
+purposes at the moment.
+
+The **MX_THREAD_STATE_\*** values are defined by
+
+```
+#include <magenta/syscalls/object.h>
+```
+
+*   *MX_THREAD_STATE_NEW*
+*   *MX_THREAD_STATE_RUNNING*
+*   *MX_THREAD_STATE_SUSPENDED*
+*   *MX_THREAD_STATE_BLOCKED*
+*   *MX_THREAD_STATE_DYING*
+*   *MX_THREAD_STATE_DEAD*
+
 The **MX_EXCEPTION_PORT_TYPE_\*** values are defined by

 ```
@@ -22,8 +22,9 @@ have a maximum capacity.

 Data written to one handle may be read from the opposite.

-The *options* must currently be either **MX_SOCKET_STREAM** or
-**MX_SOCKET_DATAGRAM**.
+The *options* must set either the **MX_SOCKET_STREAM** or
+**MX_SOCKET_DATAGRAM** flag. The **MX_SOCKET_HAS_CONTROL** flag
+can also be set to enable the socket control plane.

 ## RETURN VALUE

@@ -30,6 +30,9 @@ If the socket was created with **MX_SOCKET_DATAGRAM** and *buffer*
 is too small for the packet, then the packet will be truncated,
 and any remaining bytes in the packet are discarded.

+If *options* is set to **MX_SOCKET_CONTROL**, then **socket_read**()
+attempts to read from the socket control plane.
+
 ## RETURN VALUE

 **socket_read**() returns **MX_OK** on success, and writes into
@@ -39,19 +42,23 @@ and any remaining bytes in the packet are discarded.

 **MX_ERR_BAD_HANDLE**  *handle* is not a valid handle.

+**MX_ERR_BAD_STATE** *options* includes **MX_SOCKET_CONTROL** and the
+socket was not created with **MX_SOCKET_HAS_CONTROL**.
+
 **MX_ERR_WRONG_TYPE**  *handle* is not a socket handle.

 **MX_ERR_INVALID_ARGS** If any of *buffer* or *actual* are non-NULL
 but invalid pointers, or if *buffer* is NULL but *size* is positive,
-or if *options* is nonzero.
+or if *options* is neither zero nor **MX_SOCKET_CONTROL**.

 **MX_ERR_ACCESS_DENIED**  *handle* does not have **MX_RIGHT_READ**.

 **MX_ERR_SHOULD_WAIT**  The socket contained no data to read.

-**MX_ERR_PEER_CLOSED**  The other side of the socket is closed, or this
-side of the socket has been previously closed via a write with the
-**MX_SOCKET_HALF_CLOSE** flag.
+**MX_ERR_PEER_CLOSED**  The other side of the socket is closed and no data is
+readable.
+
+**MX_ERR_BAD_STATE**  Reading has been disabled for this socket endpoint.

 **MX_ERR_NO_MEMORY**  (Temporary) Failure due to lack of memory.

@@ -20,10 +20,23 @@ mx_status_t mx_socket_write(mx_handle_t handle, uint32_t options,
 specified by *handle*.  The pointer to *bytes* may be NULL if *size*
 is zero.

-There is one value (besides 0) that may be passed to *options*. If
-**MX_SOCKET_HALF_CLOSE** is passed to options, and *size* is 0, then the
-socket endpoint at *handle* is closed. Further writes to the other
-endpoint of the socket will fail with **MX_ERR_BAD_STATE**.
+If *size* is zero, a bitwise combination of **MX_SOCKET_SHUTDOWN_READ** and
+**MX_SOCKET_SHUTDOWN_WRITE** can be passed to *options* to disable reading or
+writing from a socket endpoint.
+
+If **MX_SOCKET_SHUTDOWN_READ** is passed to *options*, and *size* is 0, then reading is disabled for
+the socket endpoint at *handle*. All data buffered in the socket at the time of the call may be
+read, but further reads from this endpoint or writes to the other endpoint of the socket will fail
+with **MX_ERR_BAD_STATE**.
+
+If **MX_SOCKET_SHUTDOWN_WRITE** is passed to *options*, and *size* is 0, then writing is disabled for
+the socket endpoint at *handle*. Further writes to this endpoint or reads from the other endpoint of
+the socket will fail with **MX_ERR_BAD_STATE**.
+
+If **MX_SOCKET_CONTROL** is passed to *options*, then **socket_write**() attempts to write
+into the socket control plane. A write to the control plane is never short. If the socket
+control plane has insufficient space for *buffer*, it writes nothing and returns
+**MX_ERR_OUT_OF_RANGE**.

 If a NULL *actual* is passed in, it will be ignored.

@@ -43,6 +56,9 @@ insufficient space for *buffer*, it writes nothing and returns

 **MX_ERR_BAD_HANDLE**  *handle* is not a valid handle.

+**MX_ERR_BAD_STATE** *options* includes **MX_SOCKET_CONTROL** and the
+socket was not created with **MX_SOCKET_HAS_CONTROL**.
+
 **MX_ERR_WRONG_TYPE**  *handle* is not a socket handle.

 **MX_ERR_INVALID_ARGS**  *buffer* is an invalid pointer, or
@@ -55,8 +71,7 @@ not 0, or *options* was not 0 or **MX_SOCKET_HALF_CLOSE**.
 the socket was created with **MX_SOCKET_DATAGRAM** and *buffer* is
 larger than the remaining space in the socket.

-**MX_ERR_BAD_STATE**  This side of the socket has been closed by a prior write
-to the other side with **MX_SOCKET_HALF_CLOSE**.
+**MX_ERR_BAD_STATE**  Writing has been disabled for this socket endpoint.

 **MX_ERR_PEER_CLOSED**  The other side of the socket is closed.

@@ -3,7 +3,7 @@
 ## NAME

 task_bind_exception_port - Bind to, or unbind from, the exception port
-corresponding to a given process or thread or the system exception port.
+corresponding to a given job, process, thread, or the system exception port.

 ## SYNOPSIS

@@ -17,7 +17,7 @@ mx_status_t mx_task_bind_exception_port(mx_handle_t object, mx_handle_t eport,
 ## DESCRIPTION

 **task_bind_exception_port**() is used to bind (or unbind) a port to
-the exception port of a process or thread, or the system exception port.
+the exception port of a job, process, thread, or the system exception port.

 To bind to the system exception port pass **MX_HANDLE_INVALID** for *object*.

@@ -93,6 +93,12 @@ There is only one thread exception port per thread.
 - Process - This is for exception ports bound directly to the process.
 There is only one process exception port per process.

+- Job - This is for exception ports bound to the process's job. Note that jobs
+have a hierarchy. First the process's job is searched. If it has a bound
+exception port then the exception is delivered to that port. If it does not
+have a bound exception port, or if the handler returns **MX_RESUME_TRY_NEXT**,
+then that job's parent job is searched, and so on right up to the root job.
+
 - System - This is the last port searched and gives the system a chance to
 process the exception before the kernel kills the process.

@@ -122,7 +128,7 @@ to the system exception port *object* is **MX_HANDLE_INVALID**.
 Also note that when unbinding from an exception port *eport* is
 **MX_HANDLE_INVALID**.

-**MX_ERR_WRONG_TYPE**  *object* is not that of a thread or process,
+**MX_ERR_WRONG_TYPE**  *object* is not that of a job, process, or thread,
 and is not **MX_HANDLE_INVALID**,
 or *eport* is not that of a port and is not **MX_HANDLE_INVALID**.

@@ -16,11 +16,11 @@ mx_status_t mx_timer_cancel(mx_handle_t handle);
 ## DESCRIPTION

 **mx_timer_cancel**() cancels a pending timer that was started with
-**timer_start**().
+**timer_set**().

 Upon success the pending timer is canceled and the MX_TIMER_SIGNALED
 signal is de-asserted. If a new pending timer is immediately needed
-rather than calling **timer_cancel**() first, call **timer_start**()
+rather than calling **timer_cancel**() first, call **timer_set**()
 with the new deadline.

 ## RETURN VALUE
@@ -36,9 +36,9 @@ In the event of failure, a negative error value is returned.

 ## NOTE

-Calling this function before **timer_start**() has no effect.
+Calling this function before **timer_set**() has no effect.

 ## SEE ALSO

 [timer_create](timer_create.md),
-[timer_start](timer_start.md)
+[timer_set](timer_set.md)
@@ -17,8 +17,22 @@ mx_status_t mx_timer_create(uint32_t options, uint32_t clock_id, mx_handle_t* ou

 **timer_create**() creates a timer, an object that can signal
 when a specified point in time has been reached. The only valid
-value for *options* is zero and the only valid *clock_id* is
-MX_CLOCK_MONOTONIC
+*clock_id* is MX_CLOCK_MONOTONIC.
+
+The *options* value specifies the coalescing behavior which
+controls whether the system can fire the timer earlier or later
+depending on other pending timers.
+
+The possible values are:
+
++ **MX_TIMER_SLACK_CENTER** coalescing is allowed with earlier and
+  later timers.
++ **MX_TIMER_SLACK_EARLY** coalescing is allowed only with earlier
+  timers.
++ **MX_TIMER_SLACK_LATE** coalescing is allowed only with later
+  timers.
+
+Passing 0 in options is equivalent to MX_TIMER_SLACK_CENTER.

 The returned handle has the MX_RIGHT_DUPLICATE, MX_RIGHT_TRANSFER,
 MX_RIGHT_READ and MX_RIGHT_WRITE right.
@@ -31,13 +45,14 @@ of failure, a negative error value is returned.
 ## ERRORS

 **MX_ERR_INVALID_ARGS**  *out* is an invalid pointer or NULL or
-*options* or *clock_id* is any value other than MX_CLOCK_MONOTONIC.
+*options* is not one of the MX_TIMER_SLACK values or *clock_id* is
+any value other than MX_CLOCK_MONOTONIC.

 **MX_ERR_NO_MEMORY**  (Temporary) Failure due to lack of memory.

 ## SEE ALSO

-[timer_start](timer_start.md),
+[timer_set](timer_set.md),
 [timer_cancel](timer_cancel.md),
 [deadline_after](deadline_after.md),
 [handle_close](handle_close.md)
@@ -0,0 +1,61 @@
+# mx_timer_set
+
+## NAME
+
+timer_set - start a timer
+
+## SYNOPSIS
+
+```
+#include <magenta/syscalls.h>
+
+mx_status_t mx_timer_set(mx_handle_t handle, mx_time_t deadline,
+                         mx_duration_t slack);
+
+```
+
+## DESCRIPTION
+
+**mx_timer_set**() starts a one-shot timer that will fire when
+*deadline* passes. If a previous call to **mx_timer_set**() was
+pending, the previous timer is canceled and
+*MX_TIMER_SIGNALED* is de-asserted as needed.
+
+The *deadline* parameter specifies a deadline with respect to
+**MX_CLOCK_MONOTONIC**. To wait for a relative interval, pass the
+value returned by **mx_deadline_after**() as *deadline*.
+
+To fire the timer immediately pass 0 to *deadline*.
+
+When the timer fires it asserts *MX_TIMER_SIGNALED*. To de-assert this
+signal call **timer_cancel**() or **timer_set**() again.
+
+The *slack* parameter specifies a range from *deadline* - *slack* to
+*deadline* + *slack* during which the timer is allowed to fire. The system
+uses this parameter as a hint to coalesce nearby timers.
+
+The precise coalescing behavior is controlled by the *options* parameter
+specified when the timer was created. **MX_TIMER_SLACK_EARLY** allows the
+timer to fire only early, between *deadline* - *slack* and *deadline*, and
+**MX_TIMER_SLACK_LATE** allows it to fire only late, between *deadline* and
+*deadline* + *slack*. The default option value of 0 is
+**MX_TIMER_SLACK_CENTER** and allows both early and late firing, with an
+effective interval of *deadline* - *slack* to *deadline* + *slack*.
+
+## RETURN VALUE
+
+**mx_timer_set**() returns **MX_OK** on success.
+In the event of failure, a negative error value is returned.
+
+
+## ERRORS
+
+**MX_ERR_BAD_HANDLE**  *handle* is not a valid handle.
+
+**MX_ERR_ACCESS_DENIED**  *handle* lacks the right *MX_RIGHT_WRITE*.
+
+## SEE ALSO
+
+[timer_create](timer_create.md),
+[timer_cancel](timer_cancel.md),
+[deadline_after](deadline_after.md)
@@ -1,63 +0,0 @@
-# mx_timer_start
-
-## NAME
-
-timer_start - start a timer
-
-## SYNOPSIS
-
-```
-#include <magenta/syscalls.h>
-
-mx_status_t mx_timer_start(mx_handle_t handle, mx_time_t deadline, mx_duration_t period,
-                           mx_duration_t slack);
-
-```
-
-## DESCRIPTION
-
-**mx_timer_start**() starts a timer that will fire when *deadline* passes and
-optionally continue firing afterwards when each *period* has elapsed.
-
-The *deadline* parameter specifies a deadline with respect to
-**MX_CLOCK_MONOTONIC** and cannot be zero. To wait for a relative interval,
-use **mx_deadline_after**() returned value in *deadline*.
-
-If *period* is zero, the timer is one-shot and when the timer fires it
-asserts *MX_TIMER_SIGNALED*. To de-assert this signal call **timer_cancel**()
-or **timer_start**() again.
-
-If *period* is at least *MX_TIMER_MIN_PERIOD* the timer will fire
-when *deadline* passes, then at *dealine* + *period* and so on. In this
-mode the *MX_TIMER_SIGNALED* signal is not asserted but strobed.
-This means that it can satisfy an existing wait operation or generate a
-port signal packet, but it cannot be reliably inspected.
-
-The *slack* parameter should be set to zero.
-
-## RETURN VALUE
-
-**mx_timer_start**() returns **MX_OK** on success.
-In the event of failure, a negative error value is returned.
-
-
-## ERRORS
-
-**MX_ERR_BAD_HANDLE**  *handle* is not a valid handle.
-
-**MX_ERR_ACCESS_DENIED**  *handle* lacks the right *MX_RIGHT_WRITE*.
-
-**MX_ERR_INVALID_ARGS**  *deadline* is less than *MX_TIMER_MIN_DEADLINE*
-
-**MX_ERR_NOT_SUPPORTED**  *period* is less than *MX_TIMER_MIN_PERIOD*.
-
-## NOTE
-
-*slack* is ignored at the moment. It will be used to coalesce timers.
-
-
-## SEE ALSO
-
-[timer_create](timer_create.md),
-[timer_cancel](timer_cancel.md),
-[deadline_after](deadline_after.md)
@@ -8,9 +8,9 @@ vcpu_resume - resume execution of a VCPU

 ```
 #include <magenta/syscalls.h>
-#include <magenta/syscalls/hypervisor.h>
+#include <magenta/syscalls/port.h>

-mx_status_t mx_vcpu_resume(mx_handle_t vcpu, mx_guest_packet_t* packet);
+mx_status_t mx_vcpu_resume(mx_handle_t vcpu, mx_port_packet_t* packet);
 ```

 ## DESCRIPTION
@@ -54,12 +54,12 @@ It’s possible to end up in a situation where the machine *really* wants to hel
 7. Check that “Windows Boot Manager” didn’t get moved to the top of the boot order, fix it if it did

 ## How to Create a Bootable USB Flash Drive
-1. Build the bootloader
-  * `(cd $MAGENTA_ROOT; make bootloader)`
+1. Build everything
+  * `(cd $FUCHSIA_ROOT; fbuild)`
 2. Format your USB Flash Drive with a FAT32 partition as the first partition
-3. Copy `$MAGENTA_ROOT/build-magenta-pc-x86-64/bootloader/bootx64.efi` to `EFI/BOOT/BOOTX64.EFI` on the USB Flash Drive.
+3. Copy `$FUCHSIA_ROOT/out/build-magenta/build-magenta-pc-x86-64/bootloader/bootx64.efi` to `EFI/BOOT/BOOTX64.EFI` on the USB Flash Drive.
 If you plan to netboot, you're done.
-4. Copy `build-magenta-pc-x86-64/magenta.bin` to the root of the USB Flash Drive
+4. Copy `$FUCHSIA_ROOT/out/build-magenta/build-magenta-pc-x86-64/magenta.bin` to the root of the USB Flash Drive
 5. Optionally copy an additional bootfs image to `ramdisk.bin` on the root of the USB Flash Drive (for a Fuchsia build, a bootfs image can be found at `$FUCHSIA_ROOT/out/debug-x86-64/user.bootfs`)

 If you need to boot magenta over the network, skip step 4 and/or delete
@@ -7,7 +7,7 @@ WARNING:  These are directions to configure the machine and boot an experimental
 These instructions configure the machine to boot from a USB flash drive.

 1. Remove four bottom plate screws and bottom plate
-2. Install memory (and optionally M.2 SSD)
+2. Install memory (and optionally M.2 SSD (only SATA is supported; NVMe lacks a driver))
 3. Boot into Visual BIOS (F2)
 4. Select the Wrench menu (upper right), select Visual Bios Settings
 5. Disable Internet Updates (Requires a mouse due to the wonders of Visual BIOS)
@@ -62,7 +62,7 @@ private:
 int Example::IncreaseFoo(int by) {
    int new_value;
    {
-        AutoLock lock(&lock_);  // mxtl::AutoLock is annotated
+        AutoLock lock(&lock_);  // fbl::AutoLock is annotated
        new_value = IncreaseFooLocked(by);
    }
    return new_value;
@@ -0,0 +1,256 @@
+# Fuchsia Tracing System Design
+
+This document describes a mechanism for collecting diagnostic trace information
+from running applications on the Fuchsia operating system.
+
+## Overview
+
+The purpose of Fuchsia tracing is to provide a means to collect, aggregate,
+and visualize diagnostic tracing information from Fuchsia user space
+processes and from the Magenta kernel.
+
+## Design Goals
+
+- Lightweight Instrumentation
+  - Enabling tracing should not significantly affect the performance of running
+    applications.  Trace providers should not need to acquire locks, make
+    syscalls, or perform dynamic memory allocation between the time when
+    tracing is activated and when it is disabled.
+- Compact Memory Footprint
+  - Trace records are stored compactly in memory so that buffers can remain
+    small but hold many events.
+- Crash-proof
+  - It is possible to collect partial traces even if trace providers
+    terminate (normally or abnormally) during trace collection.
+- Flexible and Universal
+  - Can trace code written in any language given a suitable implementation of
+    the tracing library.
+  - Trace points can be manually inserted by the developer or generated
+    dynamically by tools.
+- General
+  - The trace format defines general purpose record types which support a
+    wide range of data collection needs.
+  - Trace data can be transformed into other formats for visualization using
+    tools such as Catapult or TraceViz.
+- Extensible
+  - New record types can be added in the future without breaking existing tools.
+- Robust
+  - Enabling tracing does not compromise the integrity of running components
+    or expose them to manipulation by tracing clients.
+
+## Moving Parts
+
+### Trace Manager
+
+The trace manager is a system service which coordinates registration of
+trace providers.  It ensures that tracing proceeds in an orderly manner
+and isolates components which offer trace providers from trace clients.
+
+The trace manager implements two FIDL interfaces:
+
+- `TraceController`: Provides trace clients with the ability to enumerate
+  trace providers and collect trace data.
+- `TraceRegistry`: Provides trace providers with the ability to register
+  themselves at runtime so that they can be discovered by the tracing system.
+
+TODO: The `TraceRegistry` should be replaced by a `Namespace` based approach
+to publish trace providers from components.
+
+### Trace Providers
+
+Components which can be traced or offer tracing information to the system
+implement the `TraceProvider` FIDL interface and register it with the
+`TraceRegistry`.  Once registered, they will receive messages whenever
+tracing is started or stopped and will have the opportunity to provide
+trace data encoded in the [Fuchsia Trace Format](trace_format.md).
+
+#### Kernel Trace Provider
+
+The `ktrace_provider` program ingests kernel trace events and publishes
+trace records.  This allows kernel trace data to be captured and visualized
+together with userspace trace data.
+
+### Trace Client
+
+The `trace` program offers command-line access to tracing functionality
+for developers.  It also supports converting Fuchsia trace archives into
+other formats, such as Catapult JSON records which can be visualized
+using Catapult (aka. chrome://tracing).
+
+Trace information can also be collected programmatically by using the
+`TraceController` FIDL interface directly.
+
+## Libraries
+
+### libtrace: The C and C++ Trace Event Library
+
+Provides macros and inline functions for instrumenting C and C++ programs
+with trace points for capturing trace data during trace execution.
+
+See `<trace/event.h>`.
+
+#### C++ Example
+
+This example records trace events marking the beginning and end of the
+execution of the "DoSomething" function together with its parameters.
+
+```c++
+#include <trace/event.h>
+
+void DoSomething(int a, std::string b) {
+  TRACE_DURATION("example", "DoSomething", "a", a, "b", b);
+
+  // Do something
+}
+```
+
+#### C Example
+
+This example records trace events marking the beginning and end of the
+execution of the "DoSomething" function together with its parameters.
+
+Unlike in C++, it is necessary to specify the type of each trace argument.
+In C++ such annotations are supported but are optional since the compiler
+can infer the type itself.
+
+```c
+#include <trace/event.h>
+
+void DoSomething(int a, const char* b) {
+  TRACE_DURATION("example", "DoSomething", "a", TA_INT32(a), "b", TA_STRING(b));
+
+  // Do something
+}
+```
+
+#### Suppressing Tracing Within a Compilation Unit
+
+To completely suppress tracing within a compilation unit, define the NTRACE
+macro prior to including the trace headers.  This causes the macros to
+behave as if tracing is always disabled so they will not produce trace
+records and they will have zero runtime overhead.
+
+```c
+#define NTRACE
+#include <trace/event.h>
+
+void DoSomething(void) {
+  // This will never produce trace records because the NTRACE macro was
+  // defined above.
+  TRACE_DURATION("example", "DoSomething");
+}
+```
+
+### libtrace-provider: Trace Provider Library
+
+This library provides C and C++ functions to register a process's trace
+engine with the Fuchsia tracing system.  For tracing to work in your process,
+you must initialize the trace provider at some point during its execution
+(or implement your own trace handler to register the trace engine some
+other way).
+
+The trace provider requires an asynchronous dispatcher to operate.
+
+#### C++ Example
+
+```c++
+#include <async/loop.h>
+#include <trace-provider/provider.h>
+
+int main(int argc, char** argv) {
+  // Create a message loop.
+  async::Loop loop;
+
+  // Start a thread for the loop to run on.
+  // We could instead use async_loop_run() to run on the current thread.
+  mx_status_t status = loop.StartThread();
+  if (status != MX_OK) exit(1);
+
+  // Create the trace provider.
+  trace::TraceProvider trace_provider(loop.async());
+
+  // Do something...
+
+  // The loop and trace provider will shut down once the scope exits.
+  return 0;
+}
+```
+
+#### C Example
+
+```c
+#include <async/loop.h>
+#include <trace-provider/provider.h>
+
+int main(int argc, char** argv) {
+  mx_status_t status;
+  async_t* async;
+  trace_provider_t* trace_provider;
+
+  // Create a message loop.
+  status = async_loop_create(NULL, &async);
+  if (status != MX_OK) exit(1);
+
+  // Start a thread for the loop to run on.
+  // We could instead use async_loop_run() to run on the current thread.
+  status = async_loop_start_thread(async, "loop", NULL);
+  if (status != MX_OK) exit(1);
+
+  // Create the trace provider.
+  trace_provider = trace_provider_create(async);
+  if (!trace_provider) exit(1);
+
+  // Do something...
+
+  // Tear down.
+  trace_provider_destroy(trace_provider);
+  async_loop_shutdown(async);
+  return 0;
+}
+```
+
+### libtrace-reader: Trace Reader Library
+
+Provides C++ types and functions for reading trace archives.
+
+See `<trace-reader/reader.h>`.
+
+## Transport Protocol
+
+When the developer initiates tracing, the trace manager asks all relevant
+trace providers to start tracing and provides each one with a trace buffer
+VMO into which they should write their trace records.
+
+While a trace is running, the trace manager continues watching for newly
+registered trace providers and activates them if needed.
+
+If a trace provider's trace buffer becomes full while a trace is running,
+that trace provider will stop recording events but other trace providers will
+continue to record trace events into their own buffers as usual until the
+trace stops.  This may result in a partially incomplete trace.
+
+TODO(MG-1107): Improve buffering behavior to support continuous tracing.
+
+When tracing finishes, the trace manager asks all of the active trace providers
+to stop tracing then waits a short time for them to acknowledge that they
+have finished writing out their trace events.
+
+The trace manager then reads and validates trace data written into the trace
+buffer VMOs by trace providers and creates a trace archive.  The trace manager
+can often recover partial data even when trace providers terminate abnormally
+as long as they managed to store some data into their trace buffers.
+
+The trace manager delivers the resulting trace archive to its client through
+a socket.  This data is guaranteed to be well-formed according to the
+Fuchsia trace format (but it may be nonsensical if trace providers
+deliberately emit garbage data).
+
+These are some important invariants of the transport protocol:
+- There are no synchronization points between the trace manager and trace
+  providers other than starting or stopping collection.
+- Trace providers (components being traced) only ever write to trace buffers;
+  they never read from them.
+- The trace manager only ever reads from trace buffers; it never writes to them.
+- Trace clients never see the original trace buffers; they receive trace
+  archives over a socket from the trace manager.  This protects trace providers
+  from manipulation by trace clients.
@@ -0,0 +1,873 @@
+# Fuchsia Trace Format
+
+This document describes the binary format used to collect, store, and
+transmit Fuchsia trace records.
+
+See [Fuchsia Tracing](tracing.md) for an overview.
+
+## Purpose
+
+While a trace is running, _trace providers_ write records into a trace buffer
+VMO shared with the trace manager using the binary format described in this
+document.
+
+The binary format is designed to introduce minimal impact upon the
+performance of the subject under trace while writing traces.  The records
+are also written sequentially so that if a trace terminates (normally or
+abnormally), the trace manager can still recover partial trace data already
+stored in the trace buffer by reading everything up to the last well-formed
+record.
+
+As the trace progresses, the _trace manager_ aggregates records from all
+trace providers which are participating in trace collection and concatenates
+them together with some special metadata records to form a trace archive.
+
+Once the trace completes, tools such as the `trace` command-line program
+can read the trace records within the trace archive to visualize the results
+or save them to a file for later consumption.
+
+## Features
+
+- Small footprint
+  - Trace records are compact, packing information into a small number of bits.
+  - Pooling strings, processes, and threads further compacts the trace data.
+- Memory aligned
+  - Trace records maintain an 8 byte alignment in memory to facilitate
+    writing them directly into memory mapped VMOs.
+- Variable size records
+  - Overall record size is limited to 32 KB.
+  - Large objects may need to be broken up into multiple records.
+- Extensible
+  - There’s room to define new record types as needed.
+  - Unrecognized or malformed trace records can be skipped.
+
+## Encoding Primitives
+
+### Records
+
+A trace record is a binary encoded piece of trace information consisting of
+a sequence of [atoms](#atoms).
+
+All records include a header word which contains the following basic
+information:
+
+- **Record Type**: A 4-bit field which identifies the type of the record
+  and the information it contains.  See [Record Types](#record-types).
+- **Record Size**: A 12-bit field which indicates the number of words
+  (multiples of 8 byte units) within the record _including the record
+  header itself_.  The maximum possible size of a record is 4095 words
+  (32760 bytes).  Very simple records may be just 1 word (8 bytes) long.
+
+Records are always a multiple of 8 bytes in length and are stored with
+8 byte alignment.
+
+### Atoms
+
+Each record is constructed as a sequence of atoms.
+
+Each atom is written with 8 byte alignment and has a size which is also a
+multiple of 8 bytes so as to preserve alignment.
+
+There are two kinds of atoms:
+
+- **Word**: A 64-bit value which may be further subdivided into bit fields.
+  Words are stored in machine word order (little-endian on all currently
+  supported architectures).
+- **Stream**: A sequence of bytes padded with zeros to the next 8 byte
+  boundary.  Streams are stored in byte order.  Streams which are an exact
+  multiple of 8 bytes long are not padded (there is no zero terminator).
+
+**Fields** are subdivisions of 64-bit **Words**, denoted
+`[<least significant bit> .. <most significant bit>]` where the first and
+last bit positions are inclusive.  All unused bits are reserved for future
+use and must be set to 0.
+
+**Words** and **Fields** store unsigned integers unless otherwise specified
+by the record format.
+
+**Streams** may store either UTF-8 strings or binary data, as specified by
+the record format.
+
+### Archives
+
+A trace archive is a sequence of trace records, concatenated end to end,
+which stores information collected by trace providers while a trace is
+running together with metadata records which identify and delimit sections
+of the trace produced by each trace provider.
+
+Trace archives are intended to be read sequentially since records which
+appear earlier in the trace may influence the interpretation of records
+which appear later in the trace.  The trace system provides tools for
+extracting information from trace archives and converting it into other
+forms for visualization.
+
+### Timestamps
+
+Timestamps are represented as 64-bit ticks derived from a hardware counter.
+The trace initialization record describes the number of ticks per second
+of real time.
+
+By default, we assume that 1 tick equals 1 nanosecond.
+
+### String References
+
+Strings are encoded as **String Refs** which are 16-bit values of the
+following form:
+
+- **Empty strings**: Value is zero.
+- **Indexed strings**: Most significant bit is zero.  The lower 15 bits
+  denote an index in the **string table** which was previously assigned using a
+  **String Record**.
+- **Inline strings**: Most significant bit is one.  The lower 15 bits
+  denote the length of the string in bytes.  The string's content appears
+  inline in another part of the record as specified by the record format.
+
+To make traces more compact, frequently referenced strings, such as event
+category and name constants, should be registered into the **string table**
+using **String Records** then referenced by index.
+
+There can be at most 32767 strings in the string table.  If this limit is
+reached, additional strings can be encoded by replacing existing entries
+or by encoding strings inline.
+
+String content itself is stored as a UTF-8 **Stream** without termination.
+
+The theoretical maximum length of a string is 32767 bytes but in practice this
+will be further reduced by the space required to store the rest of the record
+which contains it, so we set a conservative maximum string length limit of
+32000 bytes.
+
+### Thread References
+
+Thread and process kernel object ids (koids) are encoded as **Thread Refs**
+which are 8-bit values of the following form:
+
+- **Inline threads**: Value is zero.  The thread and process koids appear
+  inline in another part of the record as specified by the record format.
+- **Indexed threads**: Value is non-zero.  The value denotes an index in
+  the **thread table** which was previously assigned using a **Thread Record**.
+
+To make traces more compact, frequently referenced threads should be registered
+into the **thread table** using **Thread Records** then referenced by index.
+
+There can be at most 255 threads in the thread table.  If this limit is
+reached, additional threads can be encoded by replacing existing entries
+or by encoding threads inline.
+
+### Userspace Object Information
+
+Traces can include annotations about userspace objects (anything that can be
+referenced using a pointer-like value such as a C++ or Dart object) in the
+form of **Userspace Object Records**.  Trace providers typically generate
+such records when the object is created.
+
+Thereafter, any **Pointer Arguments** which refer to the same pointer will
+be associated with the referent's annotations.
+
+This makes it easy to associate human-readable labels and other information
+with objects which appear later in the trace.
+
+### Kernel Object Information
+
+Traces can include annotations about kernel objects (anything that can be
+referenced using a Magenta koid such as a process, channel, or event) in the
+form of **Kernel Object Records**.  Trace providers typically generate such
+records when the object is created.
+
+Thereafter, any **Kernel Object Id Arguments** which refer to the same koid will
+be associated with the referent's annotations.
+
+This makes it easy to associate human-readable labels and other information
+with objects which appear later in the trace.
+
+In particular, this is how the tracing system associates names with process
+and thread koids.
+
+### Arguments
+
+Arguments are typed key value pairs.
+
+Many record types allow up to 15 arguments to be appended to the record to
+provide additional information from the developer.
+
+Arguments are size-prefixed like ordinary records so that unrecognized
+argument types can be skipped.
+
+See also [Argument Types](#argument-types).
+
+## Extending the Format
+
+The trace format can be extended in the following ways:
+
+- Defining new record types.
+- Storing new information in reserved fields of existing record types.
+- Appending new information to existing record types (the presence of this
+  information can be detected by examining the record's size and payload).
+- Defining new argument types.
+
+_To preserve compatibility as the trace format evolves, all extensions must be
+documented authoritatively in this file.  Currently there is no support for
+private extensions._
+
+## Notation
+
+In the record format descriptions which follow, each constituent atom
+is labeled in italics followed by a bullet-point description of its contents.
+
+## Record Types
+
+### Record Header
+
+All records include this header which specifies the record's type and size
+together with 48 bits of data whose usage varies by record type.
+
+##### Format
+
+_header word_
+- `[0 .. 3]`: record type
+- `[4 .. 15]`: record size (inclusive of this word) as a multiple of 8 bytes
+- `[16 .. 63]`: varies by record type (must be zero if unused)
+
+### Metadata Record (record type = 0)
+
+Provides metadata about trace data which follows.
+
+This record type is reserved for use by the _trace manager_ when generating
+trace archives.  It must not be emitted by trace providers themselves.
+If the trace manager encounters a **Metadata Record** within a trace produced
+by a trace provider, it treats it as garbage and skips over it.
+
+There are several metadata record subtypes, each of which contains different
+information.
+
+##### Format
+
+_header word_
+- `[0 .. 3]`: record type (0)
+- `[4 .. 15]`: record size (inclusive of this word) as a multiple of 8 bytes
+- `[16 .. 19]`: metadata type
+- `[20 .. 63]`: varies by metadata type (must be zero if unused)
+
+#### Provider Info Metadata (metadata type = 1)
+
+This metadata identifies a trace provider which has contributed information to
+the trace.
+
+All data which follows until the next **Provider Section Metadata** or
+**Provider Info Metadata** is encountered must have been collected from the
+same provider.
+
+##### Format
+
+_header word_
+- `[0 .. 3]`: record type (0)
+- `[4 .. 15]`: record size (inclusive of this word) as a multiple of 8 bytes
+- `[16 .. 19]`: metadata type (1)
+- `[20 .. 51]`: provider id (token used to identify the provider in the trace)
+- `[52 .. 59]`: name length in bytes
+- `[60 .. 63]`: reserved (must be zero)
+
+_provider name stream_
+- UTF-8 string, padded with zeros to 8 byte alignment
+
+#### Provider Section Metadata (metadata type = 2)
+
+This metadata delimits sections of the trace which have been obtained from
+different providers.
+
+All data which follows until the next **Provider Section Metadata** or
+**Provider Info Metadata** is encountered is assumed to have been collected
+from the same provider.
+
+When reading a trace consisting of an accumulation of traces from different
+trace providers, the reader must maintain state separately for each provider’s
+traces (such as the initialization data, string table, thread table,
+userspace object table, and kernel object table) and switch contexts
+whenever it encounters a new **Provider Section Metadata** record.
+
+##### Format
+
+_header word_
+- `[0 .. 3]`: record type (0)
+- `[4 .. 15]`: record size (inclusive of this word) as a multiple of 8 bytes
+- `[16 .. 19]`: metadata type (2)
+- `[20 .. 51]`: provider id (token used to identify the provider in the trace)
+- `[52 .. 63]`: reserved (must be zero)
+
+### Initialization Record (record type = 1)
+
+Provides parameters needed to interpret the records which follow.  In the
+absence of this record, the reader may assume that 1 tick is 1 nanosecond.
+
+##### Format
+
+_header word_
+- `[0 .. 3]`: record type (1)
+- `[4 .. 15]`: record size (inclusive of this word) as a multiple of 8 bytes
+- `[16 .. 63]`: reserved (must be zero)
+
+_tick multiplier word_
+- `[0 .. 63]`: number of ticks per second
+
+### String Record (record type = 2)
+
+Registers a string in the string table, assigning it a string index in the
+range `0x0001` to `0x7fff`.  The registration replaces any prior registration
+for the given string index when interpreting the records which follow.
+
+String records which attempt to set a value for string index `0x0000` must be
+ignored since this value is reserved to represent the empty string.
+
+String records which contain empty strings must be tolerated but they’re
+pointless since the empty string can simply be encoded as zero in a string ref.
+
+##### Format
+
+_header word_
+- `[0 .. 3]`: record type (2)
+- `[4 .. 15]`: record size (inclusive of this word) as a multiple of 8 bytes
+- `[16 .. 30]`: string index (range 0x0001 to 0x7fff)
+- `[31]`: always zero (0)
+- `[32 .. 46]`: string length in bytes (range 0x0000 to 0x7fff)
+- `[47]`: always zero (0)
+- `[48 .. 63]`: reserved (must be zero)
+
+_string value stream_
+- UTF-8 string, padded with zeros to 8 byte alignment
+
+### Thread Record (record type = 3)
+
+Registers a process id and thread id pair in the thread table, assigning it a
+thread index in the range `0x01` to `0xff`.  The registration replaces any
+prior registration for the given thread index when interpreting the records
+which follow.
+
+Thread index `0x00` is reserved to denote the use of an inline thread id in
+a thread ref.  Thread records which attempt to set a value for this index
+must be ignored.
+
+##### Format
+
+_header word_
+- `[0 .. 3]`: record type (3)
+- `[4 .. 15]`: record size (inclusive of this word) as a multiple of 8 bytes
+- `[16 .. 23]`: thread index (never 0x00)
+- `[24 .. 63]`: reserved (must be zero)
+
+_process id word_
+- `[0 .. 63]`: process koid (kernel object id)
+
+_thread id word_
+- `[0 .. 63]`: thread koid (kernel object id)
+
+### Event Record (record type = 4)
+
+Describes a timestamped event.
+
+This record consists of some basic information about the event including
+when and where it happened followed by event arguments and event subtype
+specific data.
+
+##### Format
+
+_header word_
+- `[0 .. 3]`: record type (4)
+- `[4 .. 15]`: record size (inclusive of this word) as a multiple of 8 bytes
+- `[16 .. 19]`: event type
+- `[20 .. 23]`: number of arguments
+- `[24 .. 31]`: thread (thread ref)
+- `[32 .. 47]`: category (string ref)
+- `[48 .. 63]`: name (string ref)
+
+_timestamp word_
+- `[0 .. 63]`: number of ticks
+
+_process id word_ (omitted unless thread ref denotes inline thread)
+- `[0 .. 63]`: process koid (kernel object id)
+
+_thread id word_ (omitted unless thread ref denotes inline thread)
+- `[0 .. 63]`: thread koid (kernel object id)
+
+_category stream_ (omitted unless string ref denotes inline string)
+- UTF-8 string, padded with zeros to 8 byte alignment
+
+_name stream_ (omitted unless string ref denotes inline string)
+- UTF-8 string, padded with zeros to 8 byte alignment
+
+_argument data_ (repeats for each argument)
+- (see below)
+
+_event-type specific data_
+- (see below)
+
+#### Instant Event (event type = 0)
+
+Marks a moment in time on this thread.  These are equivalent to Magenta
+kernel probes.
+
+##### Format
+
+No event-type specific data required.
+
+#### Counter Event (event type = 1)
+
+Records sample values of each argument as data in a time series associated
+with the counter’s name and id.  The values may be presented graphically as a
+stacked area chart.
+
+##### Format
+
+_counter word_
+- `[0 .. 63]`: counter id
+
+#### Duration Begin Event (event type = 2)
+
+Marks the beginning of an operation on a particular thread.  Must be matched
+by a **Duration End Event**.  May be nested.
+
+##### Format
+
+No event-type specific data required.
+
+#### Duration End Event (event type = 3)
+
+Marks the end of an operation on a particular thread.
+
+##### Format
+
+No event-type specific data required.
+
+#### Async Begin Event (event type = 4)
+
+Marks the beginning of an operation which may span threads.  Must be matched
+by an **Async End Event** using the same async correlation id.
+
+##### Format
+
+_async correlation word_
+- `[0 .. 63]`: async correlation id
+
+#### Async Instant Event (event type = 5)
+
+Marks a moment within an operation which may span threads.  Must appear
+between **Async Begin Event** and **Async End Event** using the same async
+correlation id.
+
+##### Format
+
+_async correlation word_
+- `[0 .. 63]`: async correlation id
+
+#### Async End Event (event type = 6)
+
+Marks the end of an operation which may span threads.
+
+##### Format
+
+_async correlation word_
+- `[0 .. 63]`: async correlation id
+
+#### Flow Begin Event (event type = 7)
+
+Marks the beginning of an operation which results in a sequence of actions
+which may span multiple threads or abstraction layers.  Must be matched by a
+**Flow End Event** using the same flow correlation id.  This can be envisioned
+as an arrow between duration events.
+
+The beginning of the flow is associated with the enclosing duration event
+for this thread; it begins where the enclosing **Duration Event** ends.
+
+##### Format
+
+_flow correlation word_
+- `[0 .. 63]`: flow correlation id
+
+#### Flow Step Event (event type = 8)
+
+Marks a point within a flow.
+
+The step is associated with the enclosing duration event for this thread;
+the flow resumes where the enclosing duration event begins then is suspended
+at the point where the enclosing **Duration Event** ends.
+
+##### Format
+
+_flow correlation word_
+- `[0 .. 63]`: flow correlation id
+
+#### Flow End Event (event type = 9)
+
+Marks the end of a flow.
+
+The end of the flow is associated with the enclosing duration event for this
+thread; the flow resumes where the enclosing **Duration Event** begins.
+
+##### Format
+
+_flow correlation word_
+- `[0 .. 63]`: flow correlation id
+
+### Blob Record (record type = 5)
+
+Provides uninterpreted bulk data to be included in the trace.  This can be
+useful for embedding captured trace data in other formats.
+
+The blob name uniquely identifies separate blob data streams within the trace.
+By writing multiple blob records with the same name, additional chunks of
+data can be appended to a previously created blob.
+
+The blob type indicates the representation of the blob's content.
+
+##### Format
+
+_header word_
+- `[0 .. 3]`: record type (5)
+- `[4 .. 15]`: record size (inclusive of this word) as a multiple of 8 bytes
+- `[16 .. 31]`: blob name (string ref)
+- `[32 .. 47]`: blob payload size in bytes (excluding padding)
+- `[48 .. 55]`: blob type
+- `[56 .. 63]`: reserved (must be zero)
+
+_blob name stream_ (omitted unless string ref denotes inline string)
+- UTF-8 string, padded with zeros to 8 byte alignment
+
+_payload stream_ (variable size)
+- binary data, padded with zeros to 8 byte alignment
+
+##### Blob Types
+
+The following blob types are defined:
+- `0x01`: Catapult trace event data represented in JSON format
+
+### Userspace Object Record (record type = 6)
+
+Describes a userspace object, assigns it a label, and optionally associates
+key/value data with it as arguments.  Information about the object is added
+to a per-process userspace object table.
+
+When a trace consumer encounters an event with a **Pointer Argument** whose
+value matches an entry in the process’s object table, it can cross-reference
+the argument’s pointer value with a prior **Userspace Object Record** to find a
+description of the referent.
+
+##### Format
+
+_header word_
+- `[0 .. 3]`: record type (6)
+- `[4 .. 15]`: record size (inclusive of this word) as a multiple of 8 bytes
+- `[16 .. 23]`: process (thread ref)
+- `[24 .. 39]`: name (string ref)
+- `[40 .. 43]`: number of arguments
+- `[44 .. 63]`: reserved (must be zero)
+
+_pointer word_
+- `[0 .. 63]`: pointer value
+
+_process id word_ (omitted unless thread ref denotes inline thread)
+- `[0 .. 63]`: process koid (kernel object id)
+
+_name stream_ (omitted unless string ref denotes inline string)
+- UTF-8 string, padded with zeros to 8 byte alignment
+
+_argument data_ (repeats for each argument)
+- (see below)
+
+### Kernel Object Record (record type = 7)
+
+Describes a kernel object, assigns it a label, and optionally associates
+key/value data with it as arguments.  Information about the object is added
+to a global kernel object table.
+
+When a trace consumer encounters an event with a **Koid Argument**
+whose value matches an entry in the kernel object table, it can
+cross-reference the argument’s koid value with a prior **Kernel Object Record**
+to find a description of the referent.
+
+##### Format
+
+_header word_
+- `[0 .. 3]`: record type (7)
+- `[4 .. 15]`: record size (inclusive of this word) as a multiple of 8 bytes
+- `[16 .. 23]`: kernel object type (one of the `MX_OBJ_TYPE_XXX` constants from `<magenta/syscalls/object.h>`)
+- `[24 .. 39]`: name (string ref)
+- `[40 .. 43]`: number of arguments
+- `[44 .. 63]`: reserved (must be zero)
+
+_kernel object id word_
+- `[0 .. 63]`: koid (kernel object id)
+
+_name stream_ (omitted unless string ref denotes inline string)
+- UTF-8 string, padded with zeros to 8 byte alignment
+
+_argument data_ (repeats for each argument)
+- (see below)
+
+##### Argument Conventions
+
+By convention, the trace writer should include the following named arguments
+when writing kernel object records about objects of particular types.  This
+helps trace consumers correlate relationships among kernel objects.
+
+_This information may not always be available._
+
+- `"process"`: for `MX_OBJ_TYPE_THREAD` objects, specifies the koid of the
+  process which contains the thread
+
+### Context Switch Record (record type = 8)
+
+Describes a context switch during which a CPU handed off control from an
+outgoing thread to an incoming thread which resumes execution.
+
+The record specifies the new state of the outgoing thread following the
+context switch.  By definition, the new state of the incoming thread is
+"running" since it was just resumed.
+
+##### Format
+
+_header word_
+- `[0 .. 3]`: record type (8)
+- `[4 .. 15]`: record size (inclusive of this word) as a multiple of 8 bytes
+- `[16 .. 23]`: cpu number
+- `[24 .. 27]`: outgoing thread state (any of the values below except “running”)
+- `[28 .. 35]`: outgoing thread (thread ref)
+- `[36 .. 43]`: incoming thread (thread ref)
+- `[44 .. 63]`: reserved (must be zero)
+
+_timestamp word_
+- `[0 .. 63]`: number of ticks
+
+_outgoing process id word_ (omitted unless outgoing thread ref denotes inline thread)
+- `[0 .. 63]`: process koid (kernel object id)
+
+_outgoing thread id word_ (omitted unless outgoing thread ref denotes inline thread)
+- `[0 .. 63]`: thread koid (kernel object id)
+
+_incoming process id word_ (omitted unless incoming thread ref denotes inline thread)
+- `[0 .. 63]`: process koid (kernel object id)
+
+_incoming thread id word_ (omitted unless incoming thread ref denotes inline thread)
+- `[0 .. 63]`: thread koid (kernel object id)
+
+##### Thread States
+
+The following thread states are defined:
+- `0`: new
+- `1`: running
+- `2`: suspended
+- `3`: blocked
+- `4`: dying
+- `5`: dead
+
+These values align with the `MX_THREAD_STATE_XXX` constants from `<magenta/syscalls/object.h>`.
+
+### Log Record (record type = 9)
+
+Describes a message written to the log at a particular moment in time.
+
+##### Format
+
+_header word_
+- `[0 .. 3]`: record type (9)
+- `[4 .. 15]`: record size (inclusive of this word) as a multiple of 8 bytes
+- `[16 .. 30]`: log message length in bytes (range 0x0000 to 0x7fff)
+- `[31]`: always zero (0)
+- `[32 .. 39]`: thread (thread ref)
+- `[40 .. 63]`: reserved (must be zero)
+
+_timestamp word_
+- `[0 .. 63]`: number of ticks
+
+_process id word_ (omitted unless thread ref denotes inline thread)
+- `[0 .. 63]`: process koid (kernel object id)
+
+_thread id word_ (omitted unless thread ref denotes inline thread)
+- `[0 .. 63]`: thread koid (kernel object id)
+
+_log message stream_
+- UTF-8 string, padded with zeros to 8 byte alignment
+
+## Argument Types
+
+Arguments associate typed key/value data with records.  They are used together
+with **Event Record** and **Userspace Object Record** and
+**Kernel Object Record**.
+
+Each argument consists of a one word header followed by a variable number
+of words of payload.  In many cases, the header itself is sufficient to encode
+the content of the argument.
+
+### Argument Header
+
+All arguments include this header which specifies the argument's type,
+name, and size together with 32 bits of data whose usage varies by
+argument type.
+
+##### Format
+
+_argument header word_
+- `[0 .. 3]`: argument type
+- `[4 .. 15]`: argument size (inclusive of this word) as a multiple of 8 bytes
+- `[16 .. 31]`: argument name (string ref)
+- `[32 .. 63]`: varies (must be zero if not used)
+
+_argument name stream_ (omitted unless string ref denotes inline string)
+- UTF-8 string, padded with zeros to 8 byte alignment
+
+### Null Argument (argument type = 0)
+
+Represents an argument which appears in name only without a value.
+
+##### Format
+
+_argument header word_
+- `[0 .. 3]`: argument type (0)
+- `[4 .. 15]`: argument size (inclusive of this word) as a multiple of 8 bytes
+- `[16 .. 31]`: argument name (string ref)
+- `[32 .. 63]`: reserved (must be zero)
+
+_argument name stream_ (omitted unless string ref denotes inline string)
+- UTF-8 string, padded with zeros to 8 byte alignment
+
+### 32-bit Signed Integer Argument (argument type = 1)
+
+Represents a 32-bit signed integer.
+
+##### Format
+
+_argument header word_
+- `[0 .. 3]`: argument type (1)
+- `[4 .. 15]`: argument size (inclusive of this word) as a multiple of 8 bytes
+- `[16 .. 31]`: argument name (string ref)
+- `[32 .. 63]`: 32-bit signed integer
+
+_argument name stream_ (omitted unless string ref denotes inline string)
+- UTF-8 string, padded with zeros to 8 byte alignment
+
+### 32-bit Unsigned Integer Argument (argument type = 2)
+
+Represents a 32-bit unsigned integer.
+
+##### Format
+
+_argument header word_
+- `[0 .. 3]`: argument type (2)
+- `[4 .. 15]`: argument size (inclusive of this word) as a multiple of 8 bytes
+- `[16 .. 31]`: argument name (string ref)
+- `[32 .. 63]`: 32-bit unsigned integer
+
+_argument name stream_ (omitted unless string ref denotes inline string)
+- UTF-8 string, padded with zeros to 8 byte alignment
+
+### 64-bit Signed Integer Argument (argument type = 3)
+
+Represents a 64-bit signed integer.  If a value will fit in 32-bits, prefer
+using the **32-bit Signed Integer Argument** type instead.
+
+##### Format
+
+_argument header word_
+- `[0 .. 3]`: argument type (3)
+- `[4 .. 15]`: argument size (inclusive of this word) as a multiple of 8 bytes
+- `[16 .. 31]`: argument name (string ref)
+- `[32 .. 63]`: reserved (must be zero)
+
+_argument name stream_ (omitted unless string ref denotes inline string)
+- UTF-8 string, padded with zeros to 8 byte alignment
+
+_argument value word_
+- `[0 .. 63]`: 64-bit signed integer
+
+### 64-bit Unsigned Integer Argument (argument type = 4)
+
+Represents a 64-bit unsigned integer.  If a value will fit in 32-bits, prefer
+using the **32-bit Unsigned Integer Argument** type instead.
+
+##### Format
+
+_argument header word_
+- `[0 .. 3]`: argument type (4)
+- `[4 .. 15]`: argument size (inclusive of this word) as a multiple of 8 bytes
+- `[16 .. 31]`: argument name (string ref)
+- `[32 .. 63]`: reserved (must be zero)
+
+_argument name stream_ (omitted unless string ref denotes inline string)
+- UTF-8 string, padded with zeros to 8 byte alignment
+
+_argument value word_
+- `[0 .. 63]`: 64-bit unsigned integer
+
+### Double-precision Floating Point Argument (argument type = 5)
+
+Represents a double-precision floating point number.
+
+##### Format
+
+_argument header word_
+- `[0 .. 3]`: argument type (5)
+- `[4 .. 15]`: argument size (inclusive of this word) as a multiple of 8 bytes
+- `[16 .. 31]`: argument name (string ref)
+- `[32 .. 63]`: reserved (must be zero)
+
+_argument name stream_ (omitted unless string ref denotes inline string)
+- UTF-8 string, padded with zeros to 8 byte alignment
+
+_argument value word_
+- `[0 .. 63]`: double-precision floating point number
+
+### String Argument (argument type = 6)
+
+Represents a string value.
+
+##### Format
+
+_argument header word_
+- `[0 .. 3]`: argument type (6)
+- `[4 .. 15]`: argument size (inclusive of this word) as a multiple of 8 bytes
+- `[16 .. 31]`: argument name (string ref)
+- `[32 .. 47]`: argument value (string ref)
+- `[48 .. 63]`: reserved (must be zero)
+
+_argument name stream_ (omitted unless string ref denotes inline string)
+- UTF-8 string, padded with zeros to 8 byte alignment
+
+_argument value stream_ (omitted unless string ref denotes inline string)
+- UTF-8 string, padded with zeros to 8 byte alignment
+
+### Pointer Argument (argument type = 7)
+
+Represents a pointer value.  Additional information about the referent can
+be provided by a **Userspace Object Record** associated with the same pointer.
+
+##### Format
+
+_argument header word_
+- `[0 .. 3]`: argument type (7)
+- `[4 .. 15]`: argument size (inclusive of this word) as a multiple of 8 bytes
+- `[16 .. 31]`: argument name (string ref)
+- `[32 .. 63]`: reserved (must be zero)
+
+_argument name stream_ (omitted unless string ref denotes inline string)
+- UTF-8 string, padded with zeros to 8 byte alignment
+
+_argument value word_
+- `[0 .. 63]`: the pointer value
+
+### Kernel Object Id Argument (argument type = 8)
+
+Represents a koid (kernel object id).  Additional information about the
+referent can be provided by a **Kernel Object Record** associated with the
+same koid.
+
+##### Format
+
+_argument header word_
+- `[0 .. 3]`: argument type (8)
+- `[4 .. 15]`: argument size (inclusive of this word) as a multiple of 8 bytes
+- `[16 .. 31]`: argument name (string ref)
+- `[32 .. 63]`: reserved (must be zero)
+
+_argument name stream_ (omitted unless string ref denotes inline string)
+- UTF-8 string, padded with zeros to 8 byte alignment
+
+_argument value word_
+- `[0 .. 63]`: the koid (kernel object id)
@@ -0,0 +1,179 @@
+# Magenta kernel to userspace bootstrapping (`userboot`)
+
+Magenta has a microkernel style of design.  A complexity for microkernel
+designs is how to bootstrap the initial userspace processes.  Often this
+is accomplished by having the kernel implement minimal versions of
+filesystem reading and program loading just for the purpose of
+bootstrapping, even when those kernel facilities are never used after boot
+time.  Magenta takes a different approach.
+
+[TOC]
+
+## Boot loader and kernel startup
+
+A boot loader loads the kernel into memory and transfers control to the
+kernel's startup code.  The details of the boot loader protocols are not
+described here.  The boot loaders used with Magenta load both the kernel
+image and a data blob in `BOOTDATA` format.
+The [`BOOTDATA` format](../system/public/magenta/boot/bootdata.h) is a
+simple container format that embeds items passed by the boot loader,
+including hardware-specific information,
+the [kernel "command line"](kernel_cmdline.md) giving boot options, and RAM
+disk images (which are usually compressed).  The kernel extracts some
+essential information for its own use in the early stages of booting.
+
+## BOOTFS
+
+One of the items embedded in the `BOOTDATA` blob is an initial RAM disk
+filesystem image.  The image is usually compressed using the **LZ4**
+format.  Once decompressed, the image is in **BOOTFS** format.  This is a
+trivial read-only filesystem format that simply lists file names, and for
+each file the offset and size within the BOOTFS image (both values must be
+page-aligned and are limited to 32 bits).
+
+The primary BOOTFS image contains everything that the userspace system
+needs to run: executables, shared libraries, and data files.  These include
+the implementations of device drivers and more advanced filesystems that
+make it possible to read more code and data from storage or network
+devices.
+
+After the system has bootstrapped itself, the files in the primary
+BOOTFS become the read-only filesystem tree rooted at `/boot`.
+
+## Kernel loads userboot
+
+The kernel does not include any code for decompressing LZ4 format, nor
+any code for interpreting the BOOTFS format.  Instead, all of this work
+is done by the first userspace process, called `userboot`.
+
+`userboot` is a normal userspace process.  It can only make the standard
+system calls through the [vDSO](vdso.md) like any other process would, and
+is subject to the full [vDSO enforcement](vdso.md#Enforcement) regime.
+What's special about `userboot` is the way it gets loaded.
+
+`userboot` is built as an ELF dynamic shared object, using the
+same [RODSO layout](vdso.md#Read_Only-Dynamic-Shared-Object-Layout) as
+the vDSO.  Like the vDSO, the `userboot` ELF image is embedded in the
+kernel at compile time.  Its simple layout means that loading it does
+not require the kernel to interpret ELF headers at boot time.  The
+kernel only needs to know three things: the size of the read-only
+segment, the size of the executable segment, and the address of the
+`userboot` entry point.  At compile time, these values are extracted
+from the `userboot` ELF image and used as constants in the kernel code.
+
+Like any other process, `userboot` must start with the vDSO already
+mapped into its address space so it can make system calls.  The kernel
+maps both `userboot` and the vDSO into the first user process, and then
+starts it running at the `userboot` entry point.
+
+## Kernel sends `processargs` message
+
+In normal [program loading](program_loading.md),
+a [*bootstrap message*](program_loading.md#the-processargs-protocol) is
+sent to each new process.  The process's first thread receives
+a [channel](objects/channel.md) handle in a register.  It can then read
+data and handles sent by its creator.
+
+The kernel uses the exact same protocol to start `userboot`.  The kernel
+command line is split into words that become the environment strings in the
+bootstrap message.  All the handles that `userboot` itself will need, and
+that the rest of the system will need to access kernel facilities, are
+included in this message.  Following the normal format, *handle info
+entries* describe the purpose of each handle.  These include
+the [`PA_VMO_VDSO` handle](vdso.md#pa_vmo_vdso-handle).
+
+## userboot finds system calls in the vDSO
+
+The [standard convention](vdso.md#process_start_argument) for informing
+a new process of its vDSO mapping requires the process to interpret the
+vDSO's ELF headers and symbol table to locate system call entry points.
+To avoid this complexity, `userboot` finds the entry points in the vDSO
+in a different way.
+
+When the kernel maps `userboot` into the first user process, it chooses
+a random location in memory, just as normal program loading does.
+However, when it maps the vDSO in it doesn't choose another random
+location as is normal.  Instead, it places the vDSO image immediately
+after the `userboot` image in memory.  This way, the vDSO code is always
+at fixed offsets from the `userboot` code.
+
+At compile time, the symbol table entries for all the system call entry
+points are extracted from the vDSO ELF image.  These are then massaged
+into linker script symbol definitions that use each symbol's fixed
+offset into the vDSO image to define that symbol at that fixed offset
+from the linker-provided `_end` symbol.  In this way, the `userboot`
+code can make direct calls to each vDSO entry point in the exact
+location it will appear in memory after the `userboot` image itself.
+
+## userboot decompresses BOOTFS
+
+The first thing `userboot` does is to read the bootstrap message sent by
+the kernel.  Among the handles it gets from the kernel is one with
+*handle info entry* `PA_HND(PA_VMO_BOOTDATA, 0)`.  This is
+a [VMO](objects/vm_object.md) containing the `BOOTDATA` blob from the
+boot loader.  `userboot` reads the `BOOTDATA` headers from this VMO
+looking for the first item with type `BOOTDATA_BOOTFS_BOOT`.  That
+contains the [BOOTFS](#BOOTFS) image.  The item's `BOOTDATA` header
+indicates if it's compressed, which it usually is.  `userboot` maps in
+this portion of the VMO.  `userboot` contains LZ4 format support code,
+which it uses to decompress the item into a fresh VMO.
+
+## userboot loads the first "real" user process from BOOTFS
+
+Next, `userboot` examines the environment strings it received from the
+kernel, which represent the kernel command line.  If there is a string
+`userboot=`*file* then *file* will be loaded as the first real user
+process.  If no such option is present, the default *file* is `bin/devmgr`.
+The files are found in the BOOTFS image.
+
+To load the file, `userboot` implements a full-featured ELF program loader.
+Usually the file being loaded is a dynamically-linked executable with a
+`PT_INTERP` program header.  In this case, `userboot` looks for the file
+named in `PT_INTERP` and loads that instead.
+
+Then `userboot` loads the vDSO at a random address.  It starts the new
+process with the standard conventions, passing it a channel handle and the
+vDSO base address.  On that channel, `userboot` sends the
+standard [`processargs`](program_loading.md#the-processargs-protocol)
+messages.  It passes on all the important handles it received from the
+kernel (replacing specific handles such as the process-self and thread-self
+handles with those for the new process rather than for `userboot` itself).
+
+## userboot loader service
+
+Following the standard program loading protocol, when `userboot` loads a
+program via `PT_INTERP`, it sends an additional `processargs` message
+before the main message, intended for the use of the dynamic linker.  This
+message includes a `PA_SVC_LOADER` handle for a channel on which `userboot`
+provides a minimal implementation of the
+standard [loader service](program_loading.md#the-loader-service).
+
+`userboot` has only a single thread, which remains in a loop handling
+loader service requests until the channel is closed.  When it receives a
+`LOADER_SVC_OP_LOAD_OBJECT` request, it looks up the object name prefixed
+by `lib/` as a file in BOOTFS and returns a VMO of its contents.  Thus, the
+first "real" user process can be (and usually is) a dynamically linked
+executable needing various shared libraries.  The dynamic linker, the
+executable, and the shared libraries are all loaded from the same BOOTFS
+pages that will later appear as files in `/boot`.
+
+An executable that will be loaded by `userboot` (e.g. `devmgr`) should
+normally close its loader service channel once it's completed startup.
+That lets `userboot` know that it's no longer needed.
+
+## userboot rides off into the sunset
+
+When the loader service channel is closed (or if the executable had no
+`PT_INTERP` and so no loader service was required, then as soon as the
+process has been started), `userboot` no longer has anything to do.
+
+If [the `userboot.shutdown` option was given on the kernel command line](kernel_cmdline.md#userboot_shutdown),
+then `userboot` waits for the process it started to exit, and then shuts
+down the system (as if by the `dm shutdown` command).  This can be useful
+to run a single test program and then shut down the machine (or emulator).
+For example, the command line `userboot=bin/core-tests userboot.shutdown`
+runs the Magenta core tests and then shuts down.
+
+Otherwise, `userboot` does not wait for the process to exit.  `userboot`
+exits immediately, leaving the first "real" user process in charge of
+bringing up and taking down the rest of the system.
@@ -0,0 +1,283 @@
+# Magenta vDSO
+
+The Magenta vDSO is the sole means of access to [system calls](syscalls.md)
+in Magenta.  vDSO stands for *virtual Dynamic Shared Object*.  (*Dynamic
+Shared Object* is a term used for a shared library in the ELF format.)
+It's *virtual* because it's not loaded from an ELF file that sits in a
+filesystem.  Instead, the vDSO image is provided directly by the kernel.
+
+[TOC]
+
+## Using the vDSO
+
+### System Call ABI
+
+The vDSO is a shared library in the ELF format.  It's used in the normal
+way that ELF shared libraries are used, which is to look up entry points by
+symbol name in the ELF *dynamic symbol table* (the `.dynsym` section,
+located via `DT_SYMTAB`).  ELF defines a hash table format to optimize
+lookup by name in the symbol table (the `.hash` section, located via
+`DT_HASH`); GNU tools have defined an improved hash table format that makes
+lookups much more efficient (the `.gnu_hash` section, located via
+`DT_GNU_HASH`).  Fuchsia ELF shared libraries, including the vDSO, use the
+`DT_GNU_HASH` format exclusively.  (It's also possible to use the symbol
+table directly via linear search, ignoring the hash table.)
+
+The vDSO uses a [simplified layout](#Read_Only-Dynamic-Shared-Object-Layout)
+that has no writable segment and requires no dynamic relocations.  This
+makes it easier to use the system call ABI without implementing a
+general-purpose ELF loader and full ELF dynamic linking semantics.
+
+ELF symbol names are the same as C identifiers with external linkage.
+Each [system call](syscalls.md) corresponds to an ELF symbol in the vDSO,
+and has the ABI of a C function.  The vDSO functions use only the basic
+machine-specific C calling conventions governing the use of machine
+registers and the stack, which is common across many systems that use ELF,
+such as Linux and all the BSD variants.  They do not rely on complex
+features such as ELF Thread-Local Storage, nor on Fuchsia-specific ABI
+elements such as the [SafeStack](safestack.md) unsafe stack pointer.
+
+### vDSO Unwind Information
+
+The vDSO has an ELF program header of type `PT_GNU_EH_FRAME`.  This points
+to unwind information in the GNU `.eh_frame` format, which is a close
+relative of the standard DWARF Call Frame Information format.  This
+information makes it possible to recover the register values from call
+frames in the vDSO code, so that a complete stack trace can be reconstructed
+from any thread's register state with a PC value inside the vDSO code.
+These formats and their use are just the same in the vDSO as they are in any
+normal ELF shared library on Fuchsia or other systems using common GNU ELF
+extensions, such as Linux and all the BSD variants.
+
+### vDSO Build ID
+
+The vDSO has an ELF *Build ID*, as other ELF shared libraries and
+executables built with common GNU extensions do.  The Build ID is a unique
+bit string that identifies a specific build of that binary.  This is stored
+in ELF note format, pointed to by an ELF program header of type `PT_NOTE`.
+The payload of the note with name `"GNU"` and type `NT_GNU_BUILD_ID` is a
+sequence of bytes that constitutes the Build ID.
+
+One main use of Build IDs is to associate binaries with their debugging
+information and the source code they were built from.  The vDSO binary is
+innately tied to (and embedded within) the kernel binary and includes
+information specific to each kernel build, so the Build ID of the vDSO
+distinguishes kernels as well.
+
+### **process_start**() argument
+
+The [**process_start**()](syscalls/process_start.md) system call is how a
+program loader tells the kernel to start a new process's first thread
+executing.  The final argument (`arg2`
+in [the **process_start**() documentation](syscalls/process_start.md)) is a
+plain `uintptr_t` value passed to the new thread in a register.
+
+By convention, the program loader maps the vDSO into each new process's
+address space (at a random location chosen by the system) and passes the
+base address of the image to the new process's first thread in the `arg2`
+register.  This address is where the ELF file header can be found in memory,
+pointing to all the other ELF format elements necessary to look up symbol
+names and thus make system calls.
+
+### **PA_VMO_VDSO** handle
+
+The vDSO image is embedded in the kernel at compile time.  The kernel
+exposes it to userspace as a read-only [VMO](objects/vm_object.md).
+
+When a program loader sets up a new process, the only way to make it
+possible for that process to make system calls is for the program loader to
+map the vDSO into the new process's address space before its first thread
+starts running.  Hence, each process that will launch other processes
+capable of making system calls must have access to the vDSO VMO.
+
+By convention, a VMO handle for the vDSO is passed from process to process
+in the `mx_proc_args_t` bootstrap message sent to each new process
+(see [`<magenta/processargs.h>`](../system/public/magenta/processargs.h)).
+The VMO handle's entry in the handle table is identified by the *handle
+info entry* `PA_HND(PA_VMO_VDSO, 0)`.
+
+## vDSO Implementation Details
+
+### **sysgen** tool
+
+The [`sysgen` tool](../system/host/sysgen/) generates both C/C++ function
+declarations that form the public [system call](syscalls.md) API, and some
+C++ and assembly code used in the implementation of the vDSO.  Both the
+public API and the private interface between the kernel and the vDSO code
+are specified by
+[`<magenta/syscalls.sysgen>`](../system/public/magenta/syscalls.sysgen),
+which is the input to `sysgen`.
+
+The `syscall` entries in `syscalls.sysgen` fall into the following groups,
+distinguished by the presence of attributes after the system call name:
+
+ * Entries with neither `vdsocall` nor `internal` are the simple cases
+   (which are the majority of the system calls) where the public API and
+   the private API are exactly the same.  These are implemented entirely
+   by generated code.  The public API functions have names prefixed by
+   `_mx_` and `mx_` (aliases).
+
+* `vdsocall` entries are simply declarations for the public API.
+  These functions are implemented by normal, hand-written C++ code found
+  in [`system/ulib/magenta/`](../system/ulib/magenta/).  Those source
+  files `#include "private.h"` and then define the C++ function for the
+  system call with its name prefixed by `_mx_`.  Finally, they use the
+  `VDSO_INTERFACE_FUNCTION` macro on the system call's name prefixed by
+  `mx_` (no leading underscore).  This implementation code can call the
+  C++ function for any other system call entry (whether a public
+  generated call, a public hand-written `vdsocall`, or an `internal`
+  generated call), but must use its private entry point alias, which has
+  the `VDSO_mx_` prefix.  Otherwise the code is normal (minimal) C++,
+  but must be stateless and reentrant (use only its stack and registers).
+
+ * `internal` entries are declarations of a private API used only by the
+   vDSO implementation code to enter the kernel (i.e., by other functions
+   implementing `vdsocall` system calls).  These produce functions in the
+   vDSO implementation with the same C signature that would be declared in
+   the public API given the signature of the system call entry.  However,
+   instead of being named with the `_mx_` and `mx_` prefixes, these are
+   available only via `#include "private.h"` with `VDSO_mx_` prefixes.
+
+### Read-Only Dynamic Shared Object Layout
+
+The vDSO is a normal ELF shared library and can be treated like any
+other.  But it's intentionally kept to a small subset of what an ELF
+shared library in general is allowed to do.  This has several benefits:
+
+ * Mapping the ELF image into a process is straightforward and does not
+   involve any complex corner cases of general support for ELF `PT_LOAD`
+   program headers.  The vDSO's layout can be handled by special-case
+   code with no loops that reads only a few values from ELF headers.
+ * Using the vDSO does not require full-featured ELF dynamic linking.
+   In particular, the vDSO has no dynamic relocations.  Mapping in the
+   ELF `PT_LOAD` segments is the only setup that needs to be done.
+ * The vDSO code is stateless and reentrant.  It refers only to the
+   registers and stack with which it's called.  This makes it usable in
+   a wide variety of contexts with minimal constraints on how user code
+   organizes itself, which is appropriate for the mandatory ABI of an
+   operating system.  It also makes the code easier to reason about and
+   audit for robustness and security.
+
+The layout is simply two consecutive segments, each containing aligned
+whole pages:
+
+ 1. The first segment is read-only, and includes the ELF headers and
+    metadata for dynamic linking along with constant data private to the
+    vDSO's implementation.
+ 2. The second segment is executable, containing the vDSO code.
+
+The whole vDSO image consists of just these two segments' pages, present
+in the ELF image just as they should appear in memory.  To map in the
+vDSO requires only two values gleaned from the vDSO's ELF headers: the
+number of pages in each segment.
+
+### Boot-time Read-Only Data
+
+Some system calls simply return values that are constant throughout the
+runtime of the whole system, though the ABI of the system is that their
+values must be queried at runtime and cannot be compiled into user code.
+These values either are fixed in the kernel at compile time or are
+determined by the kernel at boot time from hardware or boot parameters.
+Examples include [**system_get_version**()](syscalls/system_get_version.md),
+[**system_get_num_cpus**()](syscalls/system_get_num_cpus.md), and
+[**ticks_per_second**()](syscalls/ticks_per_second.md).
+The last example is influenced by
+a [kernel command line option](kernel_cmdline.md#vdso_soft_ticks_bool).
+
+Because these values are constant, there is no need to pay the overhead
+of entering the kernel to read them.  Instead, the vDSO implementations
+of these are simple C++ functions that just return constants read from
+the vDSO's read-only data segment.  Values fixed at compile time (such
+as the system version string) are simply compiled into the vDSO.
+
+For the values determined at boot time, the kernel must modify the
+contents of the vDSO.  This is accomplished by the boot-time code that
+sets up the vDSO VMO, before it starts the first userspace process and
+gives it the VMO handle.  At compile time, the offset into the vDSO
+image of
+the [`vdso_constants`](../kernel/lib/vdso/include/lib/vdso-constants.h)
+data structure is extracted from the vDSO ELF file that will be embedded
+in the kernel.  At boot time, the kernel temporarily maps the pages of
+the VMO covering `vdso_constants` into its own address space long enough
+to initialize the structure with the right values for the current run of
+the system.
+
+### Enforcement
+
+The vDSO entry points are the only means to enter the kernel for system
+calls.  The machine-specific instructions used to enter the kernel
+(e.g. `syscall` on x86) are not part of the system ABI and it's invalid
+for user code to execute such instructions directly.  The interface
+between the kernel and the vDSO code is a private implementation detail.
+
+Because the vDSO is itself normal code that executes in userspace, the
+kernel must robustly handle all possible entries into kernel mode from
+userspace.  However, potential kernel bugs can be mitigated somewhat by
+enforcing that each kernel entry be made only from the proper vDSO code.
+This enforcement also avoids developers of userspace code circumventing
+the ABI rules (because of ignorance, malice, or misguided intent to work
+around some perceived limitation of the official ABI), which could lead
+to the private kernel-vDSO interface becoming a *de facto* ABI for
+application code.
+
+The kernel enforces correct use of the vDSO in two ways:
+
+ 1. It constrains how the vDSO VMO can be mapped into a process.
+
+    When a [**vmar_map**()](syscalls/vmar_map.md) call is made using the
+    vDSO VMO and requesting `MX_VM_FLAG_PERM_EXECUTE`, the kernel
+    requires that the offset and size of the mapping exactly match the
+    vDSO's executable segment.  It also allows only one such mapping.
+    Once the valid vDSO mapping has been established in a process, it
+    cannot be removed.  Attempts to map the vDSO a second time into the
+    same process, to unmap the vDSO code from a process, or to make an
+    executable mapping of the vDSO that doesn't use the correct offset and
+    size, fail with `MX_ERR_ACCESS_DENIED`.
+
+    At compile time, the offset and size of the vDSO's code segment are
+    extracted from the vDSO ELF file and used as constants in the
+    kernel's mapping enforcement code.
+
+    When the one valid vDSO mapping is established in a process, the
+    kernel records the address for that process so it can be checked
+    quickly.
+
+ 2. It constrains what PC locations can be used to enter the kernel.
+
+    When a user thread enters the kernel for a system call, a register
+    indicates which low-level system call is being invoked.  The
+    low-level system calls are the private interface between the kernel
+    and the vDSO; many correspond directly to the system calls in the
+    public ABI, but others do not.
+
+    For each low-level system call, there is a fixed set of PC locations
+    in the vDSO code that invoke that call.  The source code for the
+    vDSO defines internal symbols identifying each such location.  At
+    compile time, these locations are extracted from the vDSO's symbol
+    table and used to generate kernel code that defines a PC validity
+    predicate for each low-level system call.  Since there is only one
+    definition of the vDSO code used by all user processes in the
+    system, these predicates simply check for known, valid, constant
+    offsets from the beginning of the vDSO code segment.
+
+    On entry to the kernel for a system call, the kernel examines the PC
+    location of the `syscall` instruction on x86 (or equivalent
+    instruction on other machines).  It subtracts the base address of
+    the vDSO code recorded for the process at **vmar_map**() time from
+    the PC, and passes the resulting offset to the validity predicate
+    for the system call being invoked.  If the predicate rules the PC
+    invalid, the calling thread is not allowed to proceed with the
+    system call and instead takes a synthetic exception similar to the
+    machine exception that would result from invoking an undefined or
+    privileged machine instruction.
+
+### Variants
+
+**TODO(mcgrathr)**: vDSO *variants* are an experimental feature that is
+not yet in real use.  There is a proof-of-concept implementation and
+simple tests, but more work is required to implement the concept
+robustly and determine what variants will be made available.  The
+concept is to provide variants of the vDSO image that export only a
+subset of the full vDSO system call interface.  For example, system
+calls intended only for use by device drivers might be elided from the
+vDSO variant used for normal application code.
@@ -5,20 +5,19 @@
 // license that can be found in the LICENSE file or at
 // https://opensource.org/licenses/MIT

-#include <stdio.h>
 #include <app.h>
-#include <magenta/compiler.h>
 #include <kernel/thread.h>
+#include <magenta/compiler.h>
+#include <stdio.h>

 extern const struct app_descriptor __start_apps[] __WEAK;
 extern const struct app_descriptor __stop_apps[] __WEAK;

-static void start_app(const struct app_descriptor *app);
+static void start_app(const struct app_descriptor* app);

 /* one time setup */
-void apps_init(void)
-{
-    const struct app_descriptor *app;
+void apps_init() {
+    const struct app_descriptor* app;

    /* call all the init routines */
    for (app = __start_apps; app != __stop_apps; app++) {
@@ -34,22 +33,19 @@ void apps_init(void)
    }
 }

-static int app_thread_entry(void *arg)
-{
-    const struct app_descriptor *app = (const struct app_descriptor *)arg;
+static int app_thread_entry(void* arg) {
+    const struct app_descriptor* app = (const struct app_descriptor*)arg;

    app->entry(app, NULL);

    return 0;
 }

-static void start_app(const struct app_descriptor *app)
-{
-    uint32_t stack_size = (app->flags & APP_FLAG_CUSTOM_STACK_SIZE) ? app->stack_size : DEFAULT_STACK_SIZE;
+static void start_app(const struct app_descriptor* app) {
+    size_t stack_size = (app->flags & APP_FLAG_CUSTOM_STACK_SIZE) ? app->stack_size : DEFAULT_STACK_SIZE;

    printf("starting app %s\n", app->name);
-    thread_t *t = thread_create(app->name, &app_thread_entry, (void *)app, DEFAULT_PRIORITY, stack_size);
+    thread_t* t = thread_create(app->name, &app_thread_entry, (void*)app, DEFAULT_PRIORITY, stack_size);
    thread_detach(t);
    thread_resume(t);
 }
-
@@ -10,6 +10,6 @@ LOCAL_DIR := $(GET_LOCAL_DIR)
 MODULE := $(LOCAL_DIR)

 MODULE_SRCS += \
-	$(LOCAL_DIR)/app.c
+	$(LOCAL_DIR)/app.cpp

 include make/module.mk
@@ -15,6 +15,6 @@ MODULE_DEPS += \
 	kernel/lib/console

 MODULE_SRCS += \
-	$(LOCAL_DIR)/shell.c
+	$(LOCAL_DIR)/shell.cpp

 include make/module.mk
@@ -9,18 +9,12 @@
 #include <debug.h>
 #include <lib/console.h>

-static void shell_init(const struct app_descriptor *app)
-{
+static void shell_init(const struct app_descriptor* app) {
    console_init();
 }

-static void shell_entry(const struct app_descriptor *app, void *args)
-{
+static void shell_entry(const struct app_descriptor* app, void* args) {
    console_start();
 }

-APP_START(shell)
-.init = shell_init,
- .entry = shell_entry,
-  APP_END
-
+APP(shell, shell_init, shell_entry);
@@ -1,17 +0,0 @@
-# Copyright 2016 The Fuchsia Authors
-# Copyright (c) 2008-2015 Travis Geiselbrecht
-#
-# Use of this source code is governed by a MIT-style
-# license that can be found in the LICENSE file or at
-# https://opensource.org/licenses/MIT
-
-LOCAL_DIR := $(GET_LOCAL_DIR)
-
-MODULE := $(LOCAL_DIR)
-
-MODULE_SRCS += \
-	$(LOCAL_DIR)/string_tests.c \
-
-# put arch local .S files here if developing memcpy/memmove
-
-include make/module.mk
@@ -6,19 +6,19 @@

 #include "tests.h"

-#include <mxtl/alloc_checker.h>
-#include <mxtl/unique_ptr.h>
+#include <fbl/alloc_checker.h>
+#include <fbl/unique_ptr.h>
 #include <unittest.h>

 static bool alloc_checker_ctor(void* context) {
    BEGIN_TEST;

    {
-        mxtl::AllocChecker ac;
+        fbl::AllocChecker ac;
    }

    {
-        mxtl::AllocChecker ac;
+        fbl::AllocChecker ac;
        ac.check();
    }

@@ -28,7 +28,7 @@ static bool alloc_checker_ctor(void* context) {
 static bool alloc_checker_basic(void* context) {
    BEGIN_TEST;

-    mxtl::AllocChecker ac;
+    fbl::AllocChecker ac;
    ac.arm(8u, true);
    EXPECT_TRUE(ac.check(), "");

@@ -44,12 +44,12 @@ static bool alloc_checker_basic(void* context) {

 static bool alloc_checker_panic(void* context) {
    BEGIN_TEST;
-    // Enable any of the blocks below to test the possible panics.
+// Enable any of the blocks below to test the possible panics.

 #if 0
    // Arm but not check should panic (true).
    {
-        mxtl::AllocChecker ac;
+        fbl::AllocChecker ac;
        ac.arm(24u, true);
    }
 #endif
@@ -57,7 +57,7 @@ static bool alloc_checker_panic(void* context) {
 #if 0
    // Arm but not check should panic (false).
    {
-        mxtl::AllocChecker ac;
+        fbl::AllocChecker ac;
        ac.arm(24u, false);
    }
 #endif
@@ -65,7 +65,7 @@ static bool alloc_checker_panic(void* context) {
 #if 0
    // Arming twice without a check should panic.
    {
-        mxtl::AllocChecker ac;
+        fbl::AllocChecker ac;
        ac.arm(24u, true);
        ac.arm(18u, true);
    }
@@ -77,8 +77,8 @@ static bool alloc_checker_panic(void* context) {
 static bool alloc_checker_new(void* context) {
    BEGIN_TEST;

-    mxtl::AllocChecker ac;
-    mxtl::unique_ptr<char[]> arr(new (&ac) char[128]);
+    fbl::AllocChecker ac;
+    fbl::unique_ptr<char[]> arr(new (&ac) char[128]);
    EXPECT_EQ(ac.check(), true, "");

    END_TEST;
@@ -93,7 +93,7 @@ struct BigStruct {
 static bool alloc_checker_oom(void* context) {
    BEGIN_TEST;

-    mxtl::AllocChecker ac;
+    fbl::AllocChecker ac;
    for (int ix = 0; ix != 100; ++ix) {
        auto bs = new (&ac) BigStruct;
        if (!ac.check()) {
@@ -107,9 +107,9 @@ static bool alloc_checker_oom(void* context) {
 }

 UNITTEST_START_TESTCASE(alloc_checker)
-UNITTEST("alloc checker ctor & dtor",   alloc_checker_ctor)
-UNITTEST("alloc checker basic",         alloc_checker_basic)
-UNITTEST("alloc checker panic",         alloc_checker_panic)
-UNITTEST("alloc checker new",           alloc_checker_new)
-UNITTEST("alloc_checker out of mem",    alloc_checker_oom)
+UNITTEST("alloc checker ctor & dtor", alloc_checker_ctor)
+UNITTEST("alloc checker basic", alloc_checker_basic)
+UNITTEST("alloc checker panic", alloc_checker_panic)
+UNITTEST("alloc checker new", alloc_checker_new)
+UNITTEST("alloc_checker out of mem", alloc_checker_oom)
 UNITTEST_END_TESTCASE(alloc_checker, "alloc_cpp", "Tests of the C++ AllocChecker", nullptr, nullptr);
@@ -7,31 +7,34 @@

 #include "tests.h"

-#include <sys/types.h>
-#include <stdio.h>
-#include <rand.h>
+#include <arch/ops.h>
 #include <err.h>
+#include <inttypes.h>
+#include <kernel/mp.h>
+#include <kernel/mutex.h>
+#include <kernel/spinlock.h>
+#include <kernel/thread.h>
+#include <platform.h>
+#include <rand.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <kernel/thread.h>
-#include <kernel/spinlock.h>
-#include <kernel/mutex.h>
-#include <platform.h>
-#include <arch/ops.h>
-#include <inttypes.h>
+#include <sys/types.h>

-const size_t BUFSIZE = (1024*1024);
-const uint ITER = 1024;
+const size_t BUFSIZE = (8 * 1024 * 1024);
+const size_t ITER = (1UL * 1024 * 1024 * 1024 / BUFSIZE); // enough iterations to have to copy/set 1GB of memory

-__NO_INLINE static void bench_set_overhead(void)
-{
-    uint32_t *buf = malloc(BUFSIZE);
+__NO_INLINE static void bench_set_overhead() { // times the bare ITER-iteration loop so its cost is known
+    uint32_t* buf = (uint32_t*)malloc(BUFSIZE); // allocated and freed here; the timed loop never touches it

+    spin_lock_saved_state_t state;
+    arch_interrupt_save(&state, ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS); // interrupts stay off while timing
    uint64_t count = arch_cycle_count();
-    for (uint i = 0; i < ITER; i++) {
+    for (size_t i = 0; i < ITER; i++) {
        __asm__ volatile("");
    }
    count = arch_cycle_count() - count;
+    arch_interrupt_restore(state, ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS);

-    printf("took %" PRIu64 " cycles overhead to loop %u times\n",
+    printf("took %" PRIu64 " cycles overhead to loop %zu times\n", // ITER is size_t, so %zu rather than %u
           count, ITER);
@@ -39,15 +42,17 @@ __NO_INLINE static void bench_set_overhead(void)
    free(buf);
 }

-__NO_INLINE static void bench_memset(void)
-{
-    uint8_t *buf = memalign(PAGE_SIZE, BUFSIZE);
+__NO_INLINE static void bench_memset() {
+    uint8_t* buf = (uint8_t*)memalign(PAGE_SIZE, BUFSIZE);

+    spin_lock_saved_state_t state;
+    arch_interrupt_save(&state, ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS);
    uint64_t count = arch_cycle_count();
-    for (uint i = 0; i < ITER; i++) {
+    for (size_t i = 0; i < ITER; i++) {
        memset(buf, 0, BUFSIZE);
    }
    count = arch_cycle_count() - count;
+    arch_interrupt_restore(state, ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS);

    uint64_t bytes_cycle = (BUFSIZE * ITER * 1000ULL) / count;
    printf("took %" PRIu64 " cycles to memset a buffer of size %zu %d times (%zu bytes), %llu.%03llu bytes/cycle\n",
@@ -56,17 +61,19 @@ __NO_INLINE static void bench_memset(void)
    free(buf);
 }

-__NO_INLINE static void bench_memset_per_page(void)
-{
-    uint8_t *buf = memalign(PAGE_SIZE, BUFSIZE);
+__NO_INLINE static void bench_memset_per_page() {
+    uint8_t* buf = (uint8_t*)memalign(PAGE_SIZE, BUFSIZE);

+    spin_lock_saved_state_t state;
+    arch_interrupt_save(&state, ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS);
    uint64_t count = arch_cycle_count();
-    for (uint i = 0; i < ITER; i++) {
-        for (uint j = 0; j < BUFSIZE; j += PAGE_SIZE) {
+    for (size_t i = 0; i < ITER; i++) {
+        for (size_t j = 0; j < BUFSIZE; j += PAGE_SIZE) {
            memset(buf + j, 0, PAGE_SIZE);
        }
    }
    count = arch_cycle_count() - count;
+    arch_interrupt_restore(state, ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS);

    uint64_t bytes_cycle = (BUFSIZE * ITER * 1000ULL) / count;
    printf("took %" PRIu64 " cycles to per-page memset a buffer of size %zu %d times (%zu bytes), %llu.%03llu bytes/cycle\n",
@@ -75,17 +82,19 @@ __NO_INLINE static void bench_memset_per_page(void)
    free(buf);
 }

-__NO_INLINE static void bench_zero_page(void)
-{
-    uint8_t *buf = memalign(PAGE_SIZE, BUFSIZE);
+__NO_INLINE static void bench_zero_page() {
+    uint8_t* buf = (uint8_t*)memalign(PAGE_SIZE, BUFSIZE);

+    spin_lock_saved_state_t state;
+    arch_interrupt_save(&state, ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS);
    uint64_t count = arch_cycle_count();
-    for (uint i = 0; i < ITER; i++) {
-        for (uint j = 0; j < BUFSIZE; j += PAGE_SIZE) {
+    for (size_t i = 0; i < ITER; i++) {
+        for (size_t j = 0; j < BUFSIZE; j += PAGE_SIZE) {
            arch_zero_page(buf + j);
        }
    }
    count = arch_cycle_count() - count;
+    arch_interrupt_restore(state, ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS);

    uint64_t bytes_cycle = (BUFSIZE * ITER * 1000ULL) / count;
    printf("took %" PRIu64 " cycles to arch_zero_page a buffer of size %zu %d times (%zu bytes), %llu.%03llu bytes/cycle\n",
@@ -94,49 +103,48 @@ __NO_INLINE static void bench_zero_page(void)
    free(buf);
 }

-#define bench_cset(type) \
-__NO_INLINE static void bench_cset_##type(void) \
-{ \
-    type *buf = malloc(BUFSIZE); \
- \
-    uint64_t count = arch_cycle_count(); \
-    for (uint i = 0; i < ITER; i++) { \
-        for (uint j = 0; j < BUFSIZE / sizeof(*buf); j++) { \
-            buf[j] = 0; \
-        } \
-    } \
-    count = arch_cycle_count() - count; \
- \
-    uint64_t bytes_cycle = (BUFSIZE * ITER * 1000ULL) / count; \
-    printf("took %" PRIu64 " cycles to clear a buffer using wordsize %d of size %zu %d times (%zu bytes), %llu.%03llu bytes/cycle\n", \
-           count, sizeof(*buf), BUFSIZE, ITER, BUFSIZE * ITER, bytes_cycle / 1000, bytes_cycle % 1000); \
- \
-    free(buf); \
-}
-
-bench_cset(uint8_t)
-bench_cset(uint16_t)
-bench_cset(uint32_t)
-bench_cset(uint64_t)
-
-__NO_INLINE static void bench_cset_wide(void)
-{
-    uint32_t *buf = malloc(BUFSIZE);
+template <typename T> // T is the store width under test; sizeof(T) is reported as "wordsize"
+__NO_INLINE static void bench_cset() {
+    T* buf = (T*)malloc(BUFSIZE);

+    spin_lock_saved_state_t state;
+    arch_interrupt_save(&state, ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS); // interrupts stay off while timing
    uint64_t count = arch_cycle_count();
-    for (uint i = 0; i < ITER; i++) {
-        for (uint j = 0; j < BUFSIZE / sizeof(*buf) / 8; j++) {
-            buf[j*8] = 0;
-            buf[j*8+1] = 0;
-            buf[j*8+2] = 0;
-            buf[j*8+3] = 0;
-            buf[j*8+4] = 0;
-            buf[j*8+5] = 0;
-            buf[j*8+6] = 0;
-            buf[j*8+7] = 0;
+    for (size_t i = 0; i < ITER; i++) {
+        for (size_t j = 0; j < BUFSIZE / sizeof(T); j++) {
+            buf[j] = 0;
        }
    }
    count = arch_cycle_count() - count;
+    arch_interrupt_restore(state, ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS);
+
+    uint64_t bytes_cycle = (BUFSIZE * ITER * 1000ULL) / count; // bytes per cycle scaled by 1000 for three decimals
+    printf("took %" PRIu64 " cycles to clear a buffer using wordsize %zu of size %zu %zu times (%zu bytes), %" PRIu64 ".%03" PRIu64 " bytes/cycle\n",
+           count, sizeof(*buf), BUFSIZE, ITER, BUFSIZE * ITER, bytes_cycle / 1000, bytes_cycle % 1000);
+
+    free(buf);
+}
+
+__NO_INLINE static void bench_cset_wide() {
+    uint32_t* buf = (uint32_t*)malloc(BUFSIZE);
+
+    spin_lock_saved_state_t state;
+    arch_interrupt_save(&state, ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS);
+    uint64_t count = arch_cycle_count();
+    for (size_t i = 0; i < ITER; i++) {
+        for (size_t j = 0; j < BUFSIZE / sizeof(*buf) / 8; j++) {
+            buf[j * 8] = 0;
+            buf[j * 8 + 1] = 0;
+            buf[j * 8 + 2] = 0;
+            buf[j * 8 + 3] = 0;
+            buf[j * 8 + 4] = 0;
+            buf[j * 8 + 5] = 0;
+            buf[j * 8 + 6] = 0;
+            buf[j * 8 + 7] = 0;
+        }
+    }
+    count = arch_cycle_count() - count;
+    arch_interrupt_restore(state, ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS);

    uint64_t bytes_cycle = (BUFSIZE * ITER * 1000ULL) / count;
    printf("took %" PRIu64 " cycles to clear a buffer of size %zu %d times 8 words at a time (%zu bytes), %llu.%03llu bytes/cycle\n",
@@ -145,15 +153,17 @@ __NO_INLINE static void bench_cset_wide(void)
    free(buf);
 }

-__NO_INLINE static void bench_memcpy(void)
-{
-    uint8_t *buf = calloc(1, BUFSIZE);
+__NO_INLINE static void bench_memcpy() {
+    uint8_t* buf = (uint8_t*)calloc(1, BUFSIZE);

+    spin_lock_saved_state_t state;
+    arch_interrupt_save(&state, ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS);
    uint64_t count = arch_cycle_count();
-    for (uint i = 0; i < ITER; i++) {
+    for (size_t i = 0; i < ITER; i++) {
        memcpy(buf, buf + BUFSIZE / 2, BUFSIZE / 2);
    }
    count = arch_cycle_count() - count;
+    arch_interrupt_restore(state, ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS);

    uint64_t bytes_cycle = (BUFSIZE / 2 * ITER * 1000ULL) / count;
    printf("took %" PRIu64 " cycles to memcpy a buffer of size %zu %d times (%zu source bytes), %llu.%03llu source bytes/cycle\n",
@@ -162,20 +172,19 @@ __NO_INLINE static void bench_memcpy(void)
    free(buf);
 }

-__NO_INLINE static void bench_spinlock(void)
-{
+__NO_INLINE static void bench_spinlock() {
    spin_lock_saved_state_t state;
    spin_lock_saved_state_t state2;
    spin_lock_t lock;

    spin_lock_init(&lock);

-#define COUNT (128*1024*1024)
+#define COUNT (128 * 1024 * 1024)
    // test 1: acquire/release a spinlock with interrupts already disabled
    arch_interrupt_save(&state, ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS);

    uint64_t c = arch_cycle_count();
-    for (uint i = 0; i < COUNT; i++) {
+    for (size_t i = 0; i < COUNT; i++) {
        spin_lock(&lock);
        spin_unlock(&lock);
    }
@@ -189,7 +198,7 @@ __NO_INLINE static void bench_spinlock(void)
    arch_interrupt_save(&state, ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS);

    c = arch_cycle_count();
-    for (uint i = 0; i < COUNT; i++) {
+    for (size_t i = 0; i < COUNT; i++) {
        spin_lock_irqsave(&lock, state2);
        spin_unlock_irqrestore(&lock, state2);
    }
@@ -201,7 +210,7 @@ __NO_INLINE static void bench_spinlock(void)

    // test 2: acquire/release a spinlock with irq save and irqs enabled
    c = arch_cycle_count();
-    for (uint i = 0; i < COUNT; i++) {
+    for (size_t i = 0; i < COUNT; i++) {
        spin_lock_irqsave(&lock, state2);
        spin_unlock_irqrestore(&lock, state2);
    }
@@ -211,14 +220,13 @@ __NO_INLINE static void bench_spinlock(void)
 #undef COUNT
 }

-__NO_INLINE static void bench_mutex(void)
-{
+__NO_INLINE static void bench_mutex() {
    mutex_t m;
    mutex_init(&m);

-    static const uint count = 128*1024*1024;
+    static const uint count = 128 * 1024 * 1024;
    uint64_t c = arch_cycle_count();
-    for (uint i = 0; i < count; i++) {
+    for (size_t i = 0; i < count; i++) {
        mutex_acquire(&m);
        mutex_release(&m);
    }
@@ -227,8 +235,7 @@ __NO_INLINE static void bench_mutex(void)
    printf("%" PRIu64 " cycles to acquire/release uncontended mutex %u times (%" PRIu64 " cycles per)\n", c, count, c / count);
 }

-void benchmarks(void)
-{
+void benchmarks() {
    bench_set_overhead();
    bench_memcpy();
    bench_memset();
@@ -236,13 +243,12 @@ void benchmarks(void)
    bench_memset_per_page();
    bench_zero_page();

-    bench_cset_uint8_t();
-    bench_cset_uint16_t();
-    bench_cset_uint32_t();
-    bench_cset_uint64_t();
+    bench_cset<uint8_t>();
+    bench_cset<uint16_t>();
+    bench_cset<uint32_t>();
+    bench_cset<uint64_t>();
    bench_cset_wide();

    bench_spinlock();
    bench_mutex();
 }
-
@@ -7,23 +7,22 @@

 #include "tests.h"

+#include <arch.h>
+#include <arch/ops.h>
 #include <inttypes.h>
+#include <lib/console.h>
+#include <platform.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <arch.h>
-#include <arch/ops.h>
-#include <lib/console.h>
-#include <platform.h>

-static void bench_cache(size_t bufsize, uint8_t *buf)
-{
+static void bench_cache(size_t bufsize, uint8_t* buf) {
    lk_time_t t;
    bool do_free;

    if (buf == 0) {
-        buf = memalign(PAGE_SIZE, bufsize);
+        buf = (uint8_t*)memalign(PAGE_SIZE, bufsize);
        do_free = true;
    } else {
        do_free = false;
@@ -52,18 +51,17 @@ static void bench_cache(size_t bufsize, uint8_t *buf)
    printf("took %" PRIu64 " nsecs to clean %zu bytes (hot)\n", t, bufsize);
 }

-static int cache_tests(int argc, const cmd_args *argv, uint32_t flags)
-{
-    uint8_t *buf;
-    buf = (uint8_t *)((argc > 1) ? argv[1].u : 0UL);
+static int cache_tests(int argc, const cmd_args* argv, uint32_t flags) { // runs bench_cache over 2KB..8MB; always returns 0
+    uint8_t* buf;
+    buf = (uint8_t*)((argc > 1) ? argv[1].u : 0UL); // optional argv[1] is the buffer address; 0 makes bench_cache allocate

    printf("testing cache\n");

-    bench_cache(2*1024, buf);
-    bench_cache(64*1024, buf);
-    bench_cache(256*1024, buf);
-    bench_cache(1*1024*1024, buf);
-    bench_cache(8*1024*1024, buf);
+    bench_cache(2 * 1024, buf);
+    bench_cache(64 * 1024, buf);
+    bench_cache(256 * 1024, buf);
+    bench_cache(1 * 1024 * 1024, buf);
+    bench_cache(8 * 1024 * 1024, buf);
    return 0;
 }

@@ -7,18 +7,17 @@

 #include "tests.h"

-#include <stdio.h>
-#include <rand.h>
 #include <err.h>
 #include <inttypes.h>
-#include <kernel/thread.h>
-#include <kernel/mutex.h>
 #include <kernel/event.h>
 #include <kernel/mp.h>
+#include <kernel/mutex.h>
+#include <kernel/thread.h>
 #include <platform.h>
+#include <rand.h>
+#include <stdio.h>

-void clock_tests(void)
-{
+void clock_tests(void) {
    uint64_t c;
    lk_time_t t2;

@@ -42,7 +41,7 @@ void clock_tests(void)
                continue;
            }
            last = t2;
-            if (last - start > LK_MSEC(5))
+            if (last - start > LK_SEC(5))
                break;
        }
    }
@@ -62,7 +61,7 @@ void clock_tests(void)
        printf("measuring cpu clock against current_time() on cpu %u\n", cpu);

        thread_set_pinned_cpu(get_current_thread(), cpu);
-        mp_reschedule(1 << cpu, 0);
+        mp_reschedule(MP_IPI_TARGET_MASK, 1u << cpu, 0);
        thread_yield();

        for (int i = 0; i < 3; i++) {
@@ -76,6 +75,6 @@ void clock_tests(void)
    }

    thread_set_pinned_cpu(get_current_thread(), old_affinity);
-    mp_reschedule(MP_CPU_ALL_BUT_LOCAL, 0);
+    mp_reschedule(MP_IPI_TARGET_ALL_BUT_LOCAL, 0, 0);
    thread_yield();
 }
@@ -14,11 +14,10 @@
 #include <rand.h>
 #include <stdio.h>

-static int fibo_thread(void *argv)
-{
+static int fibo_thread(void* argv) { // argv carries the fibonacci index as an integer packed into the pointer
    long fibo = (intptr_t)argv;

-    thread_t *t[2];
+    thread_t* t[2]; // child threads for fibo - 1 and fibo - 2

    if (fibo == 0)
        return 0;
@@ -27,15 +26,15 @@ static int fibo_thread(void *argv)

    char name[32];
    snprintf(name, sizeof(name), "fibo %lu", fibo - 1);
-    t[0] = thread_create(name, &fibo_thread, (void *)(fibo - 1), DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
+    t[0] = thread_create(name, &fibo_thread, (void*)(fibo - 1), DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
    if (!t[0]) {
-        printf("error creating thread for fibo %d\n", fibo-1);
+        printf("error creating thread for fibo %ld\n", fibo - 1); // fibo is long, so %ld rather than %d
        return 0;
    }
    snprintf(name, sizeof(name), "fibo %lu", fibo - 2);
-    t[1] = thread_create(name, &fibo_thread, (void *)(fibo - 2), DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
+    t[1] = thread_create(name, &fibo_thread, (void*)(fibo - 2), DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
    if (!t[1]) {
-        printf("error creating thread for fibo %d\n", fibo-2);
+        printf("error creating thread for fibo %ld\n", fibo - 2); // fibo is long, so %ld rather than %d
        thread_resume(t[0]);
        thread_join(t[0], NULL, INFINITE_TIME);
        return 0;
@@ -52,8 +51,7 @@ static int fibo_thread(void *argv)
    return retcode0 + retcode1;
 }

-int fibo(int argc, const cmd_args *argv)
-{
+int fibo(int argc, const cmd_args* argv) {

    if (argc < 2) {
        printf("not enough args\n");
@@ -62,7 +60,7 @@ int fibo(int argc, const cmd_args *argv)

    lk_time_t tim = current_time();

-    thread_t *t = thread_create("fibo", &fibo_thread, (void *)(uintptr_t)argv[1].u, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
+    thread_t* t = thread_create("fibo", &fibo_thread, (void*)(uintptr_t)argv[1].u, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
    thread_resume(t);

    int retcode;
@@ -10,8 +10,7 @@

 #include "float_test_vec.c"

-int main(void)
-{
+int main(void) {
    printf("floating point printf tests\n");

    for (size_t i = 0; i < float_test_vec_size; i++) {
@@ -20,4 +19,3 @@ int main(void)

    return 0;
 }
-
@@ -1,66 +0,0 @@
-// Copyright 2016 The Fuchsia Authors
-// Copyright (c) 2014 Travis Geiselbrecht
-//
-// Use of this source code is governed by a MIT-style
-// license that can be found in the LICENSE file or at
-// https://opensource.org/licenses/MIT
-
-
-#include <stdint.h>
-
-union double_int {
-    double d;
-    uint64_t i;
-};
-
-static const union double_int float_test_vec[] = {
-    { .d = -2.0 },
-    { .d = -1.0 },
-    { .d = -0.5 },
-    { .d = -0.0 },
-    { .d = 0.0 },
-    { .d = 0.01 },
-    { .d = 0.1 },
-    { .d = 0.2 },
-    { .d = 0.25 },
-    { .d = 0.5 },
-    { .d = 0.75 },
-    { .d = 1.0 },
-    { .d = 2.0 },
-    { .d = 3.0 },
-    { .d = 10.0 },
-    { .d = 100.0 },
-    { .d = 123456.0 },
-    { .d = -123456.0 },
-    { .d = 546.5645644531f },
-    { .d = -546.5645644531f },
-    { .d = 0.12345 },
-    { .d = 0.0000012345 },
-    { .d = 0.0000019999 },
-    { .d = 0.0000015 },
-    { .i = 0x4005bf0a8b145649ULL }, // e
-    { .i = 0x400921fb54442d18ULL }, // pi
-    { .i = 0x43f0000000000000ULL }, // 2^64
-    { .i = 0x7fefffffffffffffULL }, // largest normalized
-    { .i = 0x0010000000000000ULL }, // least positive normalized
-    { .i = 0x0000000000000001ULL }, // smallest possible denorm
-    { .i = 0x000fffffffffffffULL }, // largest possible denorm
-    { .i = 0x7ff0000000000001ULL }, // smallest SNAn
-    { .i = 0x7ff7ffffffffffffULL }, // largest SNAn
-    { .i = 0x7ff8000000000000ULL }, // smallest QNAn
-    { .i = 0x7fffffffffffffffULL }, // largest QNAn
-    { .i = 0xfff0000000000000ULL }, // -infinity
-    { .i = 0x7ff0000000000000ULL }, // +infinity
-};
-
-#define countof(a) (sizeof(a) / sizeof((a)[0]))
-static const unsigned int float_test_vec_size = countof(float_test_vec);
-
-#define PRINT_FLOAT \
-        printf("0x%016llx %f %F %a %A\n", \
-                float_test_vec[i], \
-                *(const double *)&float_test_vec[i], \
-                *(const double *)&float_test_vec[i], \
-                *(const double *)&float_test_vec[i], \
-                *(const double *)&float_test_vec[i])
-
@@ -0,0 +1,64 @@
+// Copyright 2016 The Fuchsia Authors
+// Copyright (c) 2014 Travis Geiselbrecht
+//
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file or at
+// https://opensource.org/licenses/MIT
+
+#include <stdint.h>
+
+union double_int { // one 64-bit test vector, writable either as a double or as raw bits
+    double d;   // the vector given as a floating point value
+    uint64_t i; // the vector given as a 64-bit bit pattern
+};
+
+static const union double_int float_test_vec[] = { // inputs printed by PRINT_FLOAT; .d entries by value, .i entries by bit pattern
+    {.d = -2.0},
+    {.d = -1.0},
+    {.d = -0.5},
+    {.d = -0.0},
+    {.d = 0.0},
+    {.d = 0.01},
+    {.d = 0.1},
+    {.d = 0.2},
+    {.d = 0.25},
+    {.d = 0.5},
+    {.d = 0.75},
+    {.d = 1.0},
+    {.d = 2.0},
+    {.d = 3.0},
+    {.d = 10.0},
+    {.d = 100.0},
+    {.d = 123456.0},
+    {.d = -123456.0},
+    {.d = 546.5645644531f}, // NOTE(review): float literal, so it is rounded to single precision before widening - confirm intended
+    {.d = -546.5645644531f},
+    {.d = 0.12345},
+    {.d = 0.0000012345},
+    {.d = 0.0000019999},
+    {.d = 0.0000015},
+    {.i = 0x4005bf0a8b145649ULL}, // e
+    {.i = 0x400921fb54442d18ULL}, // pi
+    {.i = 0x43f0000000000000ULL}, // 2^64
+    {.i = 0x7fefffffffffffffULL}, // largest normalized
+    {.i = 0x0010000000000000ULL}, // least positive normalized
+    {.i = 0x0000000000000001ULL}, // smallest possible denorm
+    {.i = 0x000fffffffffffffULL}, // largest possible denorm
+    {.i = 0x7ff0000000000001ULL}, // smallest SNaN
+    {.i = 0x7ff7ffffffffffffULL}, // largest SNaN
+    {.i = 0x7ff8000000000000ULL}, // smallest QNaN
+    {.i = 0x7fffffffffffffffULL}, // largest QNaN
+    {.i = 0xfff0000000000000ULL}, // -infinity
+    {.i = 0x7ff0000000000000ULL}, // +infinity
+};
+
+#define countof(a) (sizeof(a) / sizeof((a)[0])) // element count of an array; the argument must be an array, not a pointer
+static const unsigned int float_test_vec_size = countof(float_test_vec); // number of entries in float_test_vec
+
+#define PRINT_FLOAT /* prints float_test_vec[i] as raw bits, then as %f %F %a %A; needs `i` in scope */ \
+    printf("0x%016llx %f %F %a %A\n",                                                                  \
+           (unsigned long long)float_test_vec[i].i, /* pass the integer member, not the whole union */ \
+           float_test_vec[i].d,                                                                        \
+           float_test_vec[i].d,                                                                        \
+           float_test_vec[i].d,                                                                        \
+           float_test_vec[i].d)
@@ -7,29 +7,28 @@

 #include "tests.h"

+#include <arch.h>
+#include <arch/ops.h>
+#include <debug.h>
+#include <err.h>
+#include <lib/console.h>
+#include <fbl/algorithm.h>
+#include <platform.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <err.h>
-#include <arch.h>
-#include <arch/ops.h>
-#include <lib/console.h>
-#include <kernel/vm/pmm.h>
-#include <kernel/vm/vm_aspace.h>
-#include <platform.h>
-#include <debug.h>
+#include <vm/pmm.h>
+#include <vm/vm_aspace.h>

-static void mem_test_fail(void *ptr, uint32_t should, uint32_t is)
-{
+static void mem_test_fail(void* ptr, uint32_t should, uint32_t is) { // reports a miscompare at ptr and hexdumps the memory around it
    printf("ERROR at %p: should be 0x%x, is 0x%x\n", ptr, should, is);

-    ptr = (void *)ROUNDDOWN((uintptr_t)ptr, 64);
+    ptr = (void*)ROUNDDOWN((uintptr_t)ptr, 64); // start the 128-byte dump at the enclosing 64-byte boundary
    hexdump(ptr, 128);
 }

-static status_t do_pattern_test(void *ptr, size_t len, uint32_t pat)
-{
-    volatile uint32_t *vbuf32 = reinterpret_cast<volatile uint32_t *>(ptr);
+static status_t do_pattern_test(void* ptr, size_t len, uint32_t pat) {
+    volatile uint32_t* vbuf32 = reinterpret_cast<volatile uint32_t*>(ptr);
    size_t i;

    printf("\tpattern 0x%08x\n", pat);
@@ -39,7 +38,7 @@ static status_t do_pattern_test(void *ptr, size_t len, uint32_t pat)

    for (i = 0; i < len / 4; i++) {
        if (vbuf32[i] != pat) {
-            mem_test_fail((void *)&vbuf32[i], pat, vbuf32[i]);
+            mem_test_fail((void*)&vbuf32[i], pat, vbuf32[i]);
            return MX_ERR_INTERNAL;
        }
    }
@@ -47,9 +46,8 @@ static status_t do_pattern_test(void *ptr, size_t len, uint32_t pat)
    return MX_OK;
 }

-static status_t do_moving_inversion_test(void *ptr, size_t len, uint32_t pat)
-{
-    volatile uint32_t *vbuf32 = reinterpret_cast<volatile uint32_t *>(ptr);
+static status_t do_moving_inversion_test(void* ptr, size_t len, uint32_t pat) {
+    volatile uint32_t* vbuf32 = reinterpret_cast<volatile uint32_t*>(ptr);
    size_t i;

    printf("\tpattern 0x%08x\n", pat);
@@ -63,7 +61,7 @@ static status_t do_moving_inversion_test(void *ptr, size_t len, uint32_t pat)
    //printf("\t\tbottom up invert\n");
    for (i = 0; i < len / 4; i++) {
        if (vbuf32[i] != pat) {
-            mem_test_fail((void *)&vbuf32[i], pat, vbuf32[i]);
+            mem_test_fail((void*)&vbuf32[i], pat, vbuf32[i]);
            return MX_ERR_INTERNAL;
        }

@@ -73,19 +71,19 @@ static status_t do_moving_inversion_test(void *ptr, size_t len, uint32_t pat)
    /* repeat, walking from top down */
    //printf("\t\ttop down invert\n");
    for (i = len / 4; i > 0; i--) {
-        if (vbuf32[i-1] != ~pat) {
-            mem_test_fail((void *)&vbuf32[i-1], ~pat, vbuf32[i-1]);
+        if (vbuf32[i - 1] != ~pat) {
+            mem_test_fail((void*)&vbuf32[i - 1], ~pat, vbuf32[i - 1]);
            return MX_ERR_INTERNAL;
        }

-        vbuf32[i-1] = pat;
+        vbuf32[i - 1] = pat;
    }

    /* verify that we have the original pattern */
    //printf("\t\tfinal test\n");
    for (i = 0; i < len / 4; i++) {
        if (vbuf32[i] != pat) {
-            mem_test_fail((void *)&vbuf32[i], pat, vbuf32[i]);
+            mem_test_fail((void*)&vbuf32[i], pat, vbuf32[i]);
            return MX_ERR_INTERNAL;
        }
    }
@@ -93,13 +91,12 @@ static status_t do_moving_inversion_test(void *ptr, size_t len, uint32_t pat)
    return MX_OK;
 }

-static void do_mem_tests(void *ptr, size_t len)
-{
+static void do_mem_tests(void* ptr, size_t len) {
    size_t i;

    /* test 1: simple write address to memory, read back */
    printf("test 1: simple address write, read back\n");
-    volatile uint32_t *vbuf32 = reinterpret_cast<volatile uint32_t *>(ptr);
+    volatile uint32_t* vbuf32 = reinterpret_cast<volatile uint32_t*>(ptr);
    for (i = 0; i < len / 4; i++) {
        vbuf32[i] = static_cast<uint32_t>(i);
    }
@@ -119,7 +116,7 @@ static void do_mem_tests(void *ptr, size_t len)
        0xaaaaaaaa, 0x55555555,
    };

-    for (size_t p = 0; p < countof(pat); p++) {
+    for (size_t p = 0; p < fbl::count_of(pat); p++) {
        if (do_pattern_test(ptr, len, pat[p]) < 0)
            goto out;
    }
@@ -136,10 +133,9 @@ static void do_mem_tests(void *ptr, size_t len)

    /* test 3: moving inversion, patterns */
    printf("test 3: moving inversions with patterns\n");
-    for (size_t p = 0; p < countof(pat); p++) {
+    for (size_t p = 0; p < fbl::count_of(pat); p++) {
        if (do_moving_inversion_test(ptr, len, pat[p]) < 0)
            goto out;
-
    }
    // shift bits through 32bit word
    for (uint32_t p = 1; p != 0; p <<= 1) {
@@ -156,18 +152,17 @@ out:
    printf("done with tests\n");
 }

-static int mem_test(int argc, const cmd_args *argv, uint32_t flags)
-{
+static int mem_test(int argc, const cmd_args* argv, uint32_t flags) {
    if (argc < 2) {
        printf("not enough arguments\n");
-usage:
+    usage:
        printf("usage: %s <length>\n", argv[0].str);
        printf("usage: %s <base> <length>\n", argv[0].str);
        return -1;
    }

    if (argc == 2) {
-        void *ptr;
+        void* ptr;
        size_t len = argv[1].u;

        /* rounding up len to the next page */
@@ -179,8 +174,8 @@ usage:

        /* allocate a region to test in */
        status_t err = VmAspace::kernel_aspace()->AllocContiguous(
-                "memtest", len, &ptr, 0, VmAspace::VMM_FLAG_COMMIT,
-                ARCH_MMU_FLAG_UNCACHED | ARCH_MMU_FLAG_PERM_READ | ARCH_MMU_FLAG_PERM_WRITE);
+            "memtest", len, &ptr, 0, VmAspace::VMM_FLAG_COMMIT,
+            ARCH_MMU_FLAG_UNCACHED | ARCH_MMU_FLAG_PERM_READ | ARCH_MMU_FLAG_PERM_WRITE);
        if (err < 0) {
            printf("error %d allocating test region\n", err);
            return -1;
@@ -198,7 +193,7 @@ usage:
        /* free the test memory */
        VmAspace::kernel_aspace()->FreeRegion(reinterpret_cast<vaddr_t>(ptr));
    } else if (argc == 3) {
-        void *ptr = argv[1].p;
+        void* ptr = argv[1].p;
        size_t len = argv[2].u;

        /* run the tests */
@@ -7,15 +7,14 @@

 #include "tests.h"

+#include <debug.h>
 #include <stdio.h>
 #include <string.h>
-#include <debug.h>

 #if !WITH_NO_FP
 #include "float_test_vec.c"

-static void printf_tests_float(void)
-{
+static void printf_tests_float(void) {
    printf("floating point printf tests\n");

    for (size_t i = 0; i < float_test_vec_size; i++) {
@@ -24,8 +23,7 @@ static void printf_tests_float(void)
 }
 #endif

-void printf_tests(void)
-{
+void printf_tests(void) {
    printf("printf tests\n");

    printf("numbers:\n");
@@ -120,4 +118,3 @@ void printf_tests(void)
    printf_tests_float();
 #endif
 }
-
@@ -10,24 +10,25 @@ LOCAL_DIR := $(GET_LOCAL_DIR)
 MODULE := $(LOCAL_DIR)

 MODULE_SRCS += \
-    $(LOCAL_DIR)/benchmarks.c \
-    $(LOCAL_DIR)/cache_tests.c \
-    $(LOCAL_DIR)/clock_tests.c \
-    $(LOCAL_DIR)/fibo.c \
+    $(LOCAL_DIR)/benchmarks.cpp \
+    $(LOCAL_DIR)/cache_tests.cpp \
+    $(LOCAL_DIR)/clock_tests.cpp \
+    $(LOCAL_DIR)/fibo.cpp \
    $(LOCAL_DIR)/mem_tests.cpp \
-    $(LOCAL_DIR)/printf_tests.c \
-    $(LOCAL_DIR)/sync_ipi_tests.c \
-    $(LOCAL_DIR)/sleep_tests.c \
-    $(LOCAL_DIR)/tests.c \
-    $(LOCAL_DIR)/thread_tests.c \
+    $(LOCAL_DIR)/printf_tests.cpp \
+    $(LOCAL_DIR)/sync_ipi_tests.cpp \
+    $(LOCAL_DIR)/sleep_tests.cpp \
+    $(LOCAL_DIR)/string_tests.c \
+    $(LOCAL_DIR)/tests.cpp \
+    $(LOCAL_DIR)/thread_tests.cpp \
    $(LOCAL_DIR)/alloc_checker_tests.cpp \
-    $(LOCAL_DIR)/timer_tests.c \
+    $(LOCAL_DIR)/timer_tests.cpp \


 MODULE_DEPS += \
    kernel/lib/crypto \
    kernel/lib/header_tests \
-    kernel/lib/mxtl \
+    kernel/lib/fbl \
    third_party/lib/safeint \
    kernel/lib/unittest \

@@ -6,14 +6,13 @@

 #include "tests.h"

-#include <stdio.h>
 #include <inttypes.h>
 #include <kernel/thread.h>
 #include <platform.h>
+#include <stdio.h>

 // Tests that thread_sleep and current_time() are consistent.
-static int thread_sleep_test(void)
-{
+static int thread_sleep_test(void) {
    int early = 0;
    for (int i = 0; i < 5; i++) {
        lk_time_t now = current_time();
@@ -27,7 +26,6 @@ static int thread_sleep_test(void)
    return early;
 }

-int sleep_tests(void)
-{
+int sleep_tests(void) { // entry point for the sleep tests; passes through thread_sleep_test()'s result
    return thread_sleep_test();
 }
@@ -13,22 +13,26 @@
 #include <stdio.h>
 #include <string.h>

-static uint8_t *src;
-static uint8_t *dst;
+static uint8_t* src;
+static uint8_t* dst;

-static uint8_t *src2;
-static uint8_t *dst2;
+static uint8_t* src2;
+static uint8_t* dst2;

-#define BUFFER_SIZE (2*1024*1024)
-#define ITERATIONS (256*1024*1024 / BUFFER_SIZE) // enough iterations to have to copy/set 256MB of memory
+#define BUFFER_SIZE (8 * 1024 * 1024)
+#define ITERATIONS (1024 * 1024 * 1024 / BUFFER_SIZE) // enough iterations to have to copy/set 1GB of memory

 #if 1
-static inline void *mymemcpy(void *dst, const void *src, size_t len) { return memcpy(dst, src, len); }
-static inline void *mymemset(void *dst, int c, size_t len) { return memset(dst, c, len); }
+static inline void* mymemcpy(void* dst, const void* src, size_t len) { // default "my" memcpy: forwards to memcpy
+    return memcpy(dst, src, len);
+}
+static inline void* mymemset(void* dst, int c, size_t len) { // default "my" memset: forwards to memset
+    return memset(dst, c, len);
+}
 #else
 // if we're testing our own memcpy, use this
-extern void *mymemcpy(void *dst, const void *src, size_t len);
-extern void *mymemset(void *dst, int c, size_t len);
+extern void* mymemcpy(void* dst, const void* src, size_t len);
+extern void* mymemset(void* dst, int c, size_t len);
 #endif

 /* reference implementations of memmove/memcpy */
@@ -37,10 +41,9 @@ typedef long word;
 #define lsize sizeof(word)
 #define lmask (lsize - 1)

-static void *c_memmove(void *dest, void const *src, size_t count)
-{
-    char *d = (char *)dest;
-    const char *s = (const char *)src;
+static void* c_memmove(void* dest, void const* src, size_t count) {
+    char* d = (char*)dest;
+    const char* s = (const char*)src;
    int len;

    if (count == 0 || dest == src)
@@ -59,7 +62,7 @@ static void *c_memmove(void *dest, void const *src, size_t count)
                *d++ = *s++;
        }
        for (len = count / lsize; len > 0; len--) {
-            *(word *)d = *(word *)s;
+            *(word*)d = *(word*)s;
            d += lsize;
            s += lsize;
        }
@@ -82,7 +85,7 @@ static void *c_memmove(void *dest, void const *src, size_t count)
        for (len = count / lsize; len > 0; len--) {
            d -= lsize;
            s -= lsize;
-            *(word *)d = *(word *)s;
+            *(word*)d = *(word*)s;
        }
        for (len = count & lmask; len > 0; len--)
            *--d = *--s;
@@ -91,13 +94,12 @@ static void *c_memmove(void *dest, void const *src, size_t count)
    return dest;
 }

-static void *c_memset(void *s, int c, size_t count)
-{
-    char *xs = (char *) s;
+static void* c_memset(void* s, int c, size_t count) {
+    char* xs = (char*)s;
    size_t len = (-(size_t)s) & lmask;
    word cc = c & 0xff;

-    if ( count > len ) {
+    if (count > len) {
        count -= len;
        cc |= cc << 8;
        cc |= cc << 16;
@@ -105,12 +107,12 @@ static void *c_memset(void *s, int c, size_t count)
            cc |= (uint64_t)cc << 32; // should be optimized out on 32 bit machines

        // write to non-aligned memory byte-wise
-        for ( ; len > 0; len-- )
+        for (; len > 0; len--)
            *xs++ = c;

        // write to aligned memory dword-wise
-        for ( len = count / lsize; len > 0; len-- ) {
-            *((word *)xs) = (word)cc;
+        for (len = count / lsize; len > 0; len--) {
+            *((word*)xs) = (word)cc;
            xs += lsize;
        }

@@ -118,50 +120,50 @@ static void *c_memset(void *s, int c, size_t count)
    }

    // write remaining bytes
-    for ( ; count > 0; count-- )
+    for (; count > 0; count--)
        *xs++ = c;

    return s;
 }

-static void *null_memcpy(void *dst, const void *src, size_t len)
-{
+static void* null_memcpy(void* dst, const void* src, size_t len) { // copies nothing; timed by bench_memcpy as the "null memcpy" case
    return dst;
 }

-static lk_time_t bench_memcpy_routine(void *memcpy_routine(void *, const void *, size_t), size_t srcalign, size_t dstalign)
-{
+static lk_time_t bench_memcpy_routine(void* memcpy_routine(void*, const void*, size_t), size_t srcalign, size_t dstalign) { // returns the current_time() delta for the whole run
    int i;
    lk_time_t t0;

    t0 = current_time();
-    for (i=0; i < ITERATIONS; i++) {
+    for (i = 0; i < ITERATIONS; i++) { // each call copies BUFFER_SIZE bytes between the offset src and dst buffers
        memcpy_routine(dst + dstalign, src + srcalign, BUFFER_SIZE);
    }
    return current_time() - t0;
 }

-static void bench_memcpy(void)
-{
+static void bench_memcpy(void) {
    lk_time_t null, c, libc, mine;
    size_t srcalign, dstalign;

    printf("memcpy speed test\n");
    thread_sleep_relative(LK_MSEC(200)); // let the debug string clear the serial port

-    for (srcalign = 0; srcalign < 64; ) {
-        for (dstalign = 0; dstalign < 64; ) {
+    for (srcalign = 0; srcalign < 64;) {
+        for (dstalign = 0; dstalign < 64;) {

+            spin_lock_saved_state_t state;
+            arch_interrupt_save(&state, ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS);
            null = bench_memcpy_routine(&null_memcpy, srcalign, dstalign) / (1000 * 1000);
            c = bench_memcpy_routine(&c_memmove, srcalign, dstalign) / (1000 * 1000);
            libc = bench_memcpy_routine(&memcpy, srcalign, dstalign) / (1000 * 1000);
            mine = bench_memcpy_routine(&mymemcpy, srcalign, dstalign) / (1000 * 1000);
+            arch_interrupt_restore(state, ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS);

            printf("srcalign %zu, dstalign %zu: ", srcalign, dstalign);
            printf("   null memcpy %" PRIu64 " msecs\n", null);
-            printf("c memcpy %" PRIu64 " msecs, %llu bytes/sec; ", c, (uint64_t)BUFFER_SIZE * ITERATIONS * 1000ULL / c);
-            printf("libc memcpy %" PRIu64 " msecs, %llu bytes/sec; ", libc, (uint64_t)BUFFER_SIZE * ITERATIONS * 1000ULL / libc);
-            printf("my memcpy %" PRIu64 " msecs, %llu bytes/sec; ", mine, (uint64_t)BUFFER_SIZE * ITERATIONS * 1000ULL / mine);
+            printf("c %" PRIu64 " msecs, %llu bytes/sec; ", c, (uint64_t)BUFFER_SIZE * ITERATIONS * 1000ULL / c);
+            printf("libc %" PRIu64 " msecs, %llu bytes/sec; ", libc, (uint64_t)BUFFER_SIZE * ITERATIONS * 1000ULL / libc);
+            printf("my %" PRIu64 " msecs, %llu bytes/sec; ", mine, (uint64_t)BUFFER_SIZE * ITERATIONS * 1000ULL / mine);
            printf("\n");

            if (dstalign < 8)
@@ -176,18 +178,16 @@ static void bench_memcpy(void)
    }
 }

-static void fillbuf(void *ptr, size_t len, uint32_t seed)
-{
+static void fillbuf(void* ptr, size_t len, uint32_t seed) {
    size_t i;

    for (i = 0; i < len; i++) {
-        ((char *)ptr)[i] = seed;
+        ((char*)ptr)[i] = seed;
        seed *= 0x1234567;
    }
 }

-static void validate_memcpy(void)
-{
+static void validate_memcpy(void) {
    size_t srcalign, dstalign, size;
    const size_t maxsize = 256;

@@ -222,20 +222,18 @@ static void validate_memcpy(void)
    }
 }

-static lk_time_t bench_memset_routine(void *memset_routine(void *, int, size_t), size_t dstalign, size_t len)
-{
+static lk_time_t bench_memset_routine(void* memset_routine(void*, int, size_t), size_t dstalign, size_t len) {
    int i;
    lk_time_t t0;

    t0 = current_time();
-    for (i=0; i < ITERATIONS; i++) {
+    for (i = 0; i < ITERATIONS; i++) {
        memset_routine(dst + dstalign, 0, len);
    }
    return current_time() - t0;
 }

-static void bench_memset(void)
-{
+static void bench_memset(void) {
    lk_time_t c, libc, mine;
    size_t dstalign;

@@ -244,20 +242,22 @@ static void bench_memset(void)

    for (dstalign = 0; dstalign < 64; dstalign++) {

+        spin_lock_saved_state_t state;
+        arch_interrupt_save(&state, ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS);
        c = bench_memset_routine(&c_memset, dstalign, BUFFER_SIZE) / (1000 * 1000);
        libc = bench_memset_routine(&memset, dstalign, BUFFER_SIZE) / (1000 * 1000);
        mine = bench_memset_routine(&mymemset, dstalign, BUFFER_SIZE) / (1000 * 1000);
+        arch_interrupt_restore(state, ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS);

        printf("dstalign %zu: ", dstalign);
-        printf("c memset %" PRIu64 " msecs, %llu bytes/sec; ", c, (uint64_t)BUFFER_SIZE * ITERATIONS * 1000ULL / c);
-        printf("libc memset %" PRIu64 " msecs, %llu bytes/sec; ", libc, (uint64_t)BUFFER_SIZE * ITERATIONS * 1000ULL / libc);
-        printf("my memset %" PRIu64 " msecs, %llu bytes/sec; ", mine, (uint64_t)BUFFER_SIZE * ITERATIONS * 1000ULL / mine);
+        printf("c %" PRIu64 " msecs, %llu bytes/sec; ", c, (uint64_t)BUFFER_SIZE * ITERATIONS * 1000ULL / c);
+        printf("libc %" PRIu64 " msecs, %llu bytes/sec; ", libc, (uint64_t)BUFFER_SIZE * ITERATIONS * 1000ULL / libc);
+        printf("my %" PRIu64 " msecs, %llu bytes/sec; ", mine, (uint64_t)BUFFER_SIZE * ITERATIONS * 1000ULL / mine);
        printf("\n");
    }
 }

-static void validate_memset(void)
-{
+static void validate_memset(void) {
    size_t dstalign, size;
    int c;
    const size_t maxsize = 256;
@@ -288,8 +288,7 @@ static void validate_memset(void)
 #if defined(WITH_LIB_CONSOLE)
 #include <lib/console.h>

-static int string_tests(int argc, const cmd_args *argv, uint32_t flags)
-{
+static int string_tests(int argc, const cmd_args* argv, uint32_t flags) {
    src = memalign(64, BUFFER_SIZE + 256);
    dst = memalign(64, BUFFER_SIZE + 256);
    src2 = memalign(64, BUFFER_SIZE + 256);
@@ -305,7 +304,7 @@ static int string_tests(int argc, const cmd_args *argv, uint32_t flags)

    if (argc < 3) {
        printf("not enough arguments:\n");
-usage:
+    usage:
        printf("%s validate <routine>\n", argv[0].str);
        printf("%s bench <routine>\n", argv[0].str);
        goto out;
@@ -341,6 +340,3 @@ STATIC_COMMAND("string", "memcpy tests", &string_tests)
 STATIC_COMMAND_END(stringtests);

 #endif
-
-APP_START(stringtests)
-APP_END
@@ -7,22 +7,22 @@

 #include "tests.h"

+#include <arch/ops.h>
 #include <assert.h>
 #include <err.h>
-#include <stdio.h>
-#include <trace.h>
-#include <arch/ops.h>
 #include <kernel/event.h>
 #include <kernel/mp.h>
 #include <kernel/thread.h>
+#include <stdio.h>
+#include <trace.h>

 #define LOCAL_TRACE 0

 #define TEST_RUNS 1000

-static void inorder_count_task(void *raw_context) {
+static void inorder_count_task(void* raw_context) {
    ASSERT(arch_ints_disabled());
-    int *inorder_counter = raw_context;
+    int* inorder_counter = (int*)raw_context;
    uint cpu_num = arch_curr_cpu_num();

    int oldval = atomic_add(inorder_counter, 1);
@@ -30,19 +30,19 @@ static void inorder_count_task(void *raw_context) {
    LTRACEF("  CPU %u checked in\n", cpu_num);
 }

-static void counter_task(void *raw_context) {
+static void counter_task(void* raw_context) {
    ASSERT(arch_ints_disabled());
-    int *counter = raw_context;
+    int* counter = (int*)raw_context;
    atomic_add(counter, 1);
 }

-static int deadlock_test_thread(void *arg) {
-    event_t *gate = arg;
+static int deadlock_test_thread(void* arg) {
+    event_t* gate = (event_t*)arg;
    event_wait(gate);

    int counter = 0;
    arch_disable_ints();
-    mp_sync_exec(MP_CPU_ALL_BUT_LOCAL, counter_task, &counter);
+    mp_sync_exec(MP_IPI_TARGET_ALL_BUT_LOCAL, 0, counter_task, &counter);
    arch_enable_ints();
    return 0;
 }
@@ -52,7 +52,7 @@ static void deadlock_test(void) {

    event_t gate = EVENT_INITIAL_VALUE(gate, false, 0);

-    thread_t *threads[5] = { 0 };
+    thread_t* threads[5] = {0};
    for (uint i = 0; i < countof(threads); ++i) {
        threads[i] = thread_create("sync_ipi_deadlock", deadlock_test_thread, &gate, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
        if (!threads[i]) {
@@ -73,8 +73,7 @@ cleanup:
    event_destroy(&gate);
 };

-int sync_ipi_tests(int argc, const cmd_args *argv)
-{
+int sync_ipi_tests(int argc, const cmd_args* argv) {
    uint num_cpus = arch_max_num_cpus();
    uint online = mp_get_online_mask();
    if (online != (1U << num_cpus) - 1) {
@@ -84,7 +83,7 @@ int sync_ipi_tests(int argc, const cmd_args *argv)

    uint runs = TEST_RUNS;
    if (argc > 1) {
-        runs = argv[1].u;
+        runs = (uint)argv[1].u;
    }

    /* Test that we're actually blocking and only signaling the ones we target */
@@ -92,7 +91,7 @@ int sync_ipi_tests(int argc, const cmd_args *argv)
        LTRACEF("Sequential test\n");
        int inorder_counter = 0;
        for (uint i = 0; i < num_cpus; ++i) {
-            mp_sync_exec(1 << i, inorder_count_task, &inorder_counter);
+            mp_sync_exec(MP_IPI_TARGET_MASK, 1u << i, inorder_count_task, &inorder_counter);
            LTRACEF("  Finished signaling CPU %u\n", i);
        }
    }
@@ -105,7 +104,7 @@ int sync_ipi_tests(int argc, const cmd_args *argv)
        spin_lock_saved_state_t irqstate;
        arch_interrupt_save(&irqstate, SPIN_LOCK_FLAG_INTERRUPTS);

-        mp_sync_exec(MP_CPU_ALL_BUT_LOCAL, counter_task, &counter);
+        mp_sync_exec(MP_IPI_TARGET_ALL_BUT_LOCAL, 0, counter_task, &counter);

        arch_interrupt_restore(irqstate, SPIN_LOCK_FLAG_INTERRUPTS);

@@ -7,7 +7,6 @@

 #include "tests.h"

-#include <app.h>
 #include <debug.h>
 #include <magenta/compiler.h>

@@ -32,13 +31,3 @@ STATIC_COMMAND("timer_tests", "tests timers", (console_cmd)&timer_tests)
 STATIC_COMMAND_END(tests);

 #endif
-
-static void tests_init(const struct app_descriptor *app)
-{
-}
-
-APP_START(tests)
-.init = tests_init,
-.flags = 0,
-APP_END
-
@@ -7,8 +7,8 @@

 #pragma once

-#include <magenta/compiler.h>
 #include <lib/console.h>
+#include <magenta/compiler.h>

 __BEGIN_CDECLS

@@ -19,19 +19,19 @@ void printf_tests(void);
 void clock_tests(void);
 void timer_tests(void);
 void benchmarks(void);
-int fibo(int argc, const cmd_args *argv);
-int spinner(int argc, const cmd_args *argv);
-int ref_counted_tests(int argc, const cmd_args *argv);
-int ref_ptr_tests(int argc, const cmd_args *argv);
-int unique_ptr_tests(int argc, const cmd_args *argv);
-int forward_tests(int argc, const cmd_args *argv);
-int list_tests(int argc, const cmd_args *argv);
-int hash_tests(int argc, const cmd_args *argv);
-int vm_tests(int argc, const cmd_args *argv);
-int auto_call_tests(int argc, const cmd_args *argv);
-int sync_ipi_tests(int argc, const cmd_args *argv);
-int arena_tests(int argc, const cmd_args *argv);
-int fifo_tests(int argc, const cmd_args *argv);
+int fibo(int argc, const cmd_args* argv);
+int spinner(int argc, const cmd_args* argv);
+int ref_counted_tests(int argc, const cmd_args* argv);
+int ref_ptr_tests(int argc, const cmd_args* argv);
+int unique_ptr_tests(int argc, const cmd_args* argv);
+int forward_tests(int argc, const cmd_args* argv);
+int list_tests(int argc, const cmd_args* argv);
+int hash_tests(int argc, const cmd_args* argv);
+int vm_tests(int argc, const cmd_args* argv);
+int auto_call_tests(int argc, const cmd_args* argv);
+int sync_ipi_tests(int argc, const cmd_args* argv);
+int arena_tests(int argc, const cmd_args* argv);
+int fifo_tests(int argc, const cmd_args* argv);
 int alloc_checker_tests(int argc, const cmd_args* argv);
 void unittests(void);

@@ -7,20 +7,19 @@

 #include "tests.h"

+#include <assert.h>
 #include <debug.h>
-#include <trace.h>
-#include <rand.h>
 #include <err.h>
 #include <inttypes.h>
-#include <assert.h>
-#include <string.h>
-#include <kernel/thread.h>
-#include <kernel/mutex.h>
 #include <kernel/event.h>
+#include <kernel/mutex.h>
+#include <kernel/thread.h>
 #include <platform.h>
+#include <rand.h>
+#include <string.h>
+#include <trace.h>

-static int sleep_thread(void *arg)
-{
+static int sleep_thread(void* arg) {
    for (;;) {
        printf("sleeper %p\n", get_current_thread());
        thread_sleep_relative(LK_MSEC(rand() % 500));
@@ -28,23 +27,21 @@ static int sleep_thread(void *arg)
    return 0;
 }

-static int sleep_test(void)
-{
+static int sleep_test(void) {
    int i;
-    for (i=0; i < 16; i++)
+    for (i = 0; i < 16; i++)
        thread_detach_and_resume(thread_create("sleeper", &sleep_thread, NULL, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE));
    return 0;
 }

-static int mutex_thread(void *arg)
-{
+static int mutex_thread(void* arg) {
    int i;
    const int iterations = 1000000;
    int count = 0;

-    static volatile int shared = 0;
+    static volatile uintptr_t shared = 0;

-    mutex_t *m = (mutex_t *)arg;
+    mutex_t* m = (mutex_t*)arg;

    printf("mutex tester thread %p starting up, will go for %d iterations\n", get_current_thread(), iterations);

@@ -72,8 +69,7 @@ static int mutex_thread(void *arg)
    return 0;
 }

-static int mutex_test(void)
-{
+static int mutex_test(void) {
    static mutex_t imutex = MUTEX_INITIAL_VALUE(imutex);
    printf("preinitialized mutex:\n");
    hexdump(&imutex, sizeof(imutex));
@@ -81,15 +77,15 @@ static int mutex_test(void)
    mutex_t m;
    mutex_init(&m);

-    thread_t *threads[5];
+    thread_t* threads[5];

-    for (uint i=0; i < countof(threads); i++) {
+    for (uint i = 0; i < countof(threads); i++) {
        threads[i] = thread_create("mutex tester", &mutex_thread, &m,
-                get_current_thread()->base_priority, DEFAULT_STACK_SIZE);
+                                   get_current_thread()->base_priority, DEFAULT_STACK_SIZE);
        thread_resume(threads[i]);
    }

-    for (uint i=0; i < countof(threads); i++) {
+    for (uint i = 0; i < countof(threads); i++) {
        thread_join(threads[i], NULL, INFINITE_TIME);
    }

@@ -102,24 +98,22 @@ static int mutex_test(void)

 static event_t e;

-static int event_signaler(void *arg)
-{
+static int event_signaler(void* arg) {
    printf("event signaler pausing\n");
    thread_sleep_relative(LK_SEC(1));

-//  for (;;) {
+    //  for (;;) {
    printf("signaling event\n");
    event_signal(&e, true);
    printf("done signaling event\n");
    thread_yield();
-//  }
+    //  }

    return 0;
 }

-static int event_waiter(void *arg)
-{
-    int count = (intptr_t)arg;
+static int event_waiter(void* arg) {
+    uintptr_t count = (uintptr_t)arg;

    while (count > 0) {
        printf("thread %p: waiting on event...\n", get_current_thread());
@@ -139,9 +133,8 @@ static int event_waiter(void *arg)
    return 0;
 }

-static void event_test(void)
-{
-    thread_t *threads[5];
+static void event_test(void) {
+    thread_t* threads[5];

    static event_t ievent = EVENT_INITIAL_VALUE(ievent, true, 0x1234);
    printf("preinitialized event:\n");
@@ -153,10 +146,10 @@ static void event_test(void)
    printf("creating event, waiting on it with 4 threads, signaling it and making sure all threads fall through twice\n");
    event_init(&e, false, 0);
    threads[0] = thread_create("event signaler", &event_signaler, NULL, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
-    threads[1] = thread_create("event waiter 0", &event_waiter, (void *)2, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
-    threads[2] = thread_create("event waiter 1", &event_waiter, (void *)2, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
-    threads[3] = thread_create("event waiter 2", &event_waiter, (void *)2, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
-    threads[4] = thread_create("event waiter 3", &event_waiter, (void *)2, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
+    threads[1] = thread_create("event waiter 0", &event_waiter, (void*)2, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
+    threads[2] = thread_create("event waiter 1", &event_waiter, (void*)2, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
+    threads[3] = thread_create("event waiter 2", &event_waiter, (void*)2, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
+    threads[4] = thread_create("event waiter 3", &event_waiter, (void*)2, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);

    for (uint i = 0; i < countof(threads); i++)
        thread_resume(threads[i]);
@@ -172,10 +165,10 @@ static void event_test(void)
    printf("creating event, waiting on it with 4 threads, signaling it and making sure only one thread wakes up\n");
    event_init(&e, false, EVENT_FLAG_AUTOUNSIGNAL);
    threads[0] = thread_create("event signaler", &event_signaler, NULL, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
-    threads[1] = thread_create("event waiter 0", &event_waiter, (void *)99, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
-    threads[2] = thread_create("event waiter 1", &event_waiter, (void *)99, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
-    threads[3] = thread_create("event waiter 2", &event_waiter, (void *)99, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
-    threads[4] = thread_create("event waiter 3", &event_waiter, (void *)99, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
+    threads[1] = thread_create("event waiter 0", &event_waiter, (void*)99, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
+    threads[2] = thread_create("event waiter 1", &event_waiter, (void*)99, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
+    threads[3] = thread_create("event waiter 2", &event_waiter, (void*)99, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
+    threads[4] = thread_create("event waiter 3", &event_waiter, (void*)99, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);

    for (uint i = 0; i < countof(threads); i++)
        thread_resume(threads[i]);
@@ -192,16 +185,14 @@ static void event_test(void)
    printf("event tests done\n");
 }

-static int quantum_tester(void *arg)
-{
+static int quantum_tester(void* arg) {
    for (;;) {
        printf("%p: in this thread. rq %" PRIu64 "\n", get_current_thread(), get_current_thread()->remaining_time_slice);
    }
    return 0;
 }

-static void quantum_test(void)
-{
+static void quantum_test(void) {
    thread_detach_and_resume(thread_create("quantum tester 0", &quantum_tester, NULL, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE));
    thread_detach_and_resume(thread_create("quantum tester 1", &quantum_tester, NULL, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE));
    thread_detach_and_resume(thread_create("quantum tester 2", &quantum_tester, NULL, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE));
@@ -211,12 +202,11 @@ static void quantum_test(void)
 static event_t context_switch_event;
 static event_t context_switch_done_event;

-static int context_switch_tester(void *arg)
-{
+static int context_switch_tester(void* arg) {
    int i;
    uint64_t total_count = 0;
    const int iter = 100000;
-    int thread_count = (intptr_t)arg;
+    uintptr_t thread_count = (uintptr_t)arg;

    event_wait(&context_switch_event);

@@ -234,12 +224,11 @@ static int context_switch_tester(void *arg)
    return 0;
 }

-static void context_switch_test(void)
-{
+static void context_switch_test(void) {
    event_init(&context_switch_event, false, 0);
    event_init(&context_switch_done_event, false, 0);

-    thread_detach_and_resume(thread_create("context switch idle", &context_switch_tester, (void *)1, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE));
+    thread_detach_and_resume(thread_create("context switch idle", &context_switch_tester, (void*)1, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE));
    thread_sleep_relative(LK_MSEC(100));
    event_signal(&context_switch_event, true);
    event_wait(&context_switch_done_event);
@@ -247,8 +236,8 @@ static void context_switch_test(void)

    event_unsignal(&context_switch_event);
    event_unsignal(&context_switch_done_event);
-    thread_detach_and_resume(thread_create("context switch 2a", &context_switch_tester, (void *)2, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE));
-    thread_detach_and_resume(thread_create("context switch 2b", &context_switch_tester, (void *)2, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE));
+    thread_detach_and_resume(thread_create("context switch 2a", &context_switch_tester, (void*)2, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE));
+    thread_detach_and_resume(thread_create("context switch 2b", &context_switch_tester, (void*)2, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE));
    thread_sleep_relative(LK_MSEC(100));
    event_signal(&context_switch_event, true);
    event_wait(&context_switch_done_event);
@@ -256,10 +245,10 @@ static void context_switch_test(void)

    event_unsignal(&context_switch_event);
    event_unsignal(&context_switch_done_event);
-    thread_detach_and_resume(thread_create("context switch 4a", &context_switch_tester, (void *)4, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE));
-    thread_detach_and_resume(thread_create("context switch 4b", &context_switch_tester, (void *)4, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE));
-    thread_detach_and_resume(thread_create("context switch 4c", &context_switch_tester, (void *)4, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE));
-    thread_detach_and_resume(thread_create("context switch 4d", &context_switch_tester, (void *)4, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE));
+    thread_detach_and_resume(thread_create("context switch 4a", &context_switch_tester, (void*)4, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE));
+    thread_detach_and_resume(thread_create("context switch 4b", &context_switch_tester, (void*)4, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE));
+    thread_detach_and_resume(thread_create("context switch 4c", &context_switch_tester, (void*)4, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE));
+    thread_detach_and_resume(thread_create("context switch 4d", &context_switch_tester, (void*)4, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE));
    thread_sleep_relative(LK_MSEC(100));
    event_signal(&context_switch_event, true);
    event_wait(&context_switch_done_event);
@@ -269,16 +258,15 @@ static void context_switch_test(void)
 static volatile int atomic;
 static volatile int atomic_count;

-static int atomic_tester(void *arg)
-{
-    int add = (intptr_t)arg;
+static int atomic_tester(void* arg) {
+    int add = (int)(uintptr_t)arg;
    int i;

    const int iter = 10000000;

    TRACEF("add %d, %d iterations\n", add, iter);

-    for (i=0; i < iter; i++) {
+    for (i = 0; i < iter; i++) {
        atomic_add(&atomic, add);
    }

@@ -288,22 +276,21 @@ static int atomic_tester(void *arg)
    return 0;
 }

-static void atomic_test(void)
-{
+static void atomic_test(void) {
    atomic = 0;
    atomic_count = 8;

    printf("testing atomic routines\n");

-    thread_t *threads[8];
-    threads[0] = thread_create("atomic tester 1", &atomic_tester, (void *)1, LOW_PRIORITY, DEFAULT_STACK_SIZE);
-    threads[1] = thread_create("atomic tester 1", &atomic_tester, (void *)1, LOW_PRIORITY, DEFAULT_STACK_SIZE);
-    threads[2] = thread_create("atomic tester 1", &atomic_tester, (void *)1, LOW_PRIORITY, DEFAULT_STACK_SIZE);
-    threads[3] = thread_create("atomic tester 1", &atomic_tester, (void *)1, LOW_PRIORITY, DEFAULT_STACK_SIZE);
-    threads[4] = thread_create("atomic tester 2", &atomic_tester, (void *)-1, LOW_PRIORITY, DEFAULT_STACK_SIZE);
-    threads[5] = thread_create("atomic tester 2", &atomic_tester, (void *)-1, LOW_PRIORITY, DEFAULT_STACK_SIZE);
-    threads[6] = thread_create("atomic tester 2", &atomic_tester, (void *)-1, LOW_PRIORITY, DEFAULT_STACK_SIZE);
-    threads[7] = thread_create("atomic tester 2", &atomic_tester, (void *)-1, LOW_PRIORITY, DEFAULT_STACK_SIZE);
+    thread_t* threads[8];
+    threads[0] = thread_create("atomic tester 1", &atomic_tester, (void*)1, LOW_PRIORITY, DEFAULT_STACK_SIZE);
+    threads[1] = thread_create("atomic tester 1", &atomic_tester, (void*)1, LOW_PRIORITY, DEFAULT_STACK_SIZE);
+    threads[2] = thread_create("atomic tester 1", &atomic_tester, (void*)1, LOW_PRIORITY, DEFAULT_STACK_SIZE);
+    threads[3] = thread_create("atomic tester 1", &atomic_tester, (void*)1, LOW_PRIORITY, DEFAULT_STACK_SIZE);
+    threads[4] = thread_create("atomic tester 2", &atomic_tester, (void*)-1, LOW_PRIORITY, DEFAULT_STACK_SIZE);
+    threads[5] = thread_create("atomic tester 2", &atomic_tester, (void*)-1, LOW_PRIORITY, DEFAULT_STACK_SIZE);
+    threads[6] = thread_create("atomic tester 2", &atomic_tester, (void*)-1, LOW_PRIORITY, DEFAULT_STACK_SIZE);
+    threads[7] = thread_create("atomic tester 2", &atomic_tester, (void*)-1, LOW_PRIORITY, DEFAULT_STACK_SIZE);

    /* start all the threads */
    for (uint i = 0; i < countof(threads); i++)
@@ -319,8 +306,7 @@ static void atomic_test(void)

 static volatile int preempt_count;

-static int preempt_tester(void *arg)
-{
+static int preempt_tester(void* arg) {
    spin(1000000);

    printf("exiting ts %" PRIu64 " ns\n", current_time());
@@ -330,8 +316,7 @@ static int preempt_tester(void *arg)
    return 0;
 }

-static void preempt_test(void)
-{
+static void preempt_test(void) {
    /* create 5 threads, let them run. If the system is properly timer preempting,
     * the threads should interleave each other at a fine enough granularity so
     * that they complete at roughly the same time. */
@@ -353,12 +338,11 @@ static void preempt_test(void)
     * complete in order, about a second apart. */
    printf("testing real time preemption\n");

-
    const int num_threads = 5;
    preempt_count = num_threads;

    for (int i = 0; i < num_threads; i++) {
-        thread_t *t = thread_create("preempt tester", &preempt_tester, NULL, LOW_PRIORITY, DEFAULT_STACK_SIZE);
+        thread_t* t = thread_create("preempt tester", &preempt_tester, NULL, LOW_PRIORITY, DEFAULT_STACK_SIZE);
        thread_set_real_time(t);
        thread_set_pinned_cpu(t, 0);
        thread_detach_and_resume(t);
@@ -371,27 +355,25 @@ static void preempt_test(void)
    printf("done with real-time preempt test, above time stamps should be 1 second apart\n");
 }

-static int join_tester(void *arg)
-{
-    long val = (long)arg;
+static int join_tester(void* arg) {
+    int val = (int)(uintptr_t)arg;

    printf("\t\tjoin tester starting\n");
    thread_sleep_relative(LK_MSEC(500));
-    printf("\t\tjoin tester exiting with result %ld\n", val);
+    printf("\t\tjoin tester exiting with result %d\n", val);

    return val;
 }

-static int join_tester_server(void *arg)
-{
+static int join_tester_server(void* arg) {
    int ret;
    status_t err;
-    thread_t *t;
+    thread_t* t;

    printf("\ttesting thread_join/thread_detach\n");

    printf("\tcreating and waiting on thread to exit with thread_join\n");
-    t = thread_create("join tester", &join_tester, (void *)1, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
+    t = thread_create("join tester", &join_tester, (void*)1, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
    thread_resume(t);
    ret = 99;
    printf("\tthread magic is 0x%x (should be 0x%x)\n", t->magic, THREAD_MAGIC);
@@ -400,7 +382,7 @@ static int join_tester_server(void *arg)
    printf("\tthread magic is 0x%x (should be 0)\n", t->magic);

    printf("\tcreating and waiting on thread to exit with thread_join, after thread has exited\n");
-    t = thread_create("join tester", &join_tester, (void *)2, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
+    t = thread_create("join tester", &join_tester, (void*)2, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
    thread_resume(t);
    thread_sleep_relative(LK_SEC(1)); // wait until thread is already dead
    ret = 99;
@@ -410,14 +392,14 @@ static int join_tester_server(void *arg)
    printf("\tthread magic is 0x%x (should be 0)\n", t->magic);

    printf("\tcreating a thread, detaching it, let it exit on its own\n");
-    t = thread_create("join tester", &join_tester, (void *)3, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
+    t = thread_create("join tester", &join_tester, (void*)3, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
    thread_detach(t);
    thread_resume(t);
    thread_sleep_relative(LK_SEC(1)); // wait until the thread should be dead
    printf("\tthread magic is 0x%x (should be 0)\n", t->magic);

    printf("\tcreating a thread, detaching it after it should be dead\n");
-    t = thread_create("join tester", &join_tester, (void *)4, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
+    t = thread_create("join tester", &join_tester, (void*)4, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
    thread_resume(t);
    thread_sleep_relative(LK_SEC(1)); // wait until thread is already dead
    printf("\tthread magic is 0x%x (should be 0x%x)\n", t->magic, THREAD_MAGIC);
@@ -429,24 +411,22 @@ static int join_tester_server(void *arg)
    return 55;
 }

-static void join_test(void)
-{
+static void join_test(void) {
    int ret;
    status_t err;
-    thread_t *t;
+    thread_t* t;

    printf("testing thread_join/thread_detach\n");

    printf("creating thread join server thread\n");
-    t = thread_create("join tester server", &join_tester_server, (void *)1, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
+    t = thread_create("join tester server", &join_tester_server, (void*)1, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
    thread_resume(t);
    ret = 99;
    err = thread_join(t, &ret, INFINITE_TIME);
    printf("thread_join returns err %d, retval %d (should be 0 and 55)\n", err, ret);
 }

-static void spinlock_test(void)
-{
+static void spinlock_test(void) {
    spin_lock_saved_state_t state;
    spin_lock_t lock;

@@ -466,59 +446,53 @@ static void spinlock_test(void)
    printf("seems to work\n");
 }

-static void sleeper_thread_exit(enum thread_user_state_change new_state, void *arg)
-{
+static void sleeper_thread_exit(enum thread_user_state_change new_state, void* arg) {
    TRACEF("arg %p\n", arg);
 }

-static int sleeper_kill_thread(void *arg)
-{
+static int sleeper_kill_thread(void* arg) {
    thread_sleep_relative(LK_MSEC(100));

    lk_time_t t = current_time();
    status_t err = thread_sleep_etc(t + LK_SEC(5), true);
    t = (current_time() - t) / LK_MSEC(1);
-    TRACEF("thread_sleep_etc returns %d after %" PRIu64" msecs\n", err, t);
+    TRACEF("thread_sleep_etc returns %d after %" PRIu64 " msecs\n", err, t);

    return 0;
 }

-static void waiter_thread_exit(enum thread_user_state_change new_state, void *arg)
-{
+static void waiter_thread_exit(enum thread_user_state_change new_state, void* arg) {
    TRACEF("arg %p\n", arg);
 }

-static int waiter_kill_thread_infinite_wait(void *arg)
-{
-    event_t *e = (event_t *)arg;
+static int waiter_kill_thread_infinite_wait(void* arg) {
+    event_t* e = (event_t*)arg;

    thread_sleep_relative(LK_MSEC(100));

    lk_time_t t = current_time();
    status_t err = event_wait_deadline(e, INFINITE_TIME, true);
    t = (current_time() - t) / LK_MSEC(1);
-    TRACEF("event_wait_deadline returns %d after %" PRIu64" msecs\n", err, t);
+    TRACEF("event_wait_deadline returns %d after %" PRIu64 " msecs\n", err, t);

    return 0;
 }

-static int waiter_kill_thread(void *arg)
-{
-    event_t *e = (event_t *)arg;
+static int waiter_kill_thread(void* arg) {
+    event_t* e = (event_t*)arg;

    thread_sleep_relative(LK_MSEC(100));

    lk_time_t t = current_time();
-    status_t err = event_wait_deadline (e, t + LK_SEC(5), true);
+    status_t err = event_wait_deadline(e, t + LK_SEC(5), true);
    t = (current_time() - t) / LK_MSEC(1);
-    TRACEF("event_wait_deadline with deadline returns %d after %" PRIu64" msecs\n", err, t);
+    TRACEF("event_wait_deadline with deadline returns %d after %" PRIu64 " msecs\n", err, t);

    return 0;
 }

-static void kill_tests(void)
-{
-    thread_t *t;
+static void kill_tests(void) {
+    thread_t* t;

    printf("starting sleeper thread, then killing it while it sleeps.\n");
    t = thread_create("sleeper", sleeper_kill_thread, 0, LOW_PRIORITY, DEFAULT_STACK_SIZE);
@@ -590,8 +564,7 @@ static void kill_tests(void)
    event_destroy(&e);
 }

-int thread_tests(void)
-{
+int thread_tests(void) {
    kill_tests();

    mutex_test();
@@ -610,23 +583,21 @@ int thread_tests(void)
    return 0;
 }

-static int spinner_thread(void *arg)
-{
+static int spinner_thread(void* arg) {
    for (;;)
        ;

    return 0;
 }

-int spinner(int argc, const cmd_args *argv)
-{
+int spinner(int argc, const cmd_args* argv) {
    if (argc < 2) {
        printf("not enough args\n");
        printf("usage: %s <priority> <rt>\n", argv[0].str);
        return -1;
    }

-    thread_t *t = thread_create("spinner", spinner_thread, NULL, argv[1].u, DEFAULT_STACK_SIZE);
+    thread_t* t = thread_create("spinner", spinner_thread, NULL, (int)argv[1].u, DEFAULT_STACK_SIZE);
    if (!t)
        return MX_ERR_NO_MEMORY;

@@ -1,126 +0,0 @@
-// Copyright 2017 The Fuchsia Authors
-//
-// Use of this source code is governed by a MIT-style
-// license that can be found in the LICENSE file or at
-// https://opensource.org/licenses/MIT
-
-#include "tests.h"
-
-#include <stdio.h>
-#include <err.h>
-#include <inttypes.h>
-#include <kernel/timer.h>
-#include <kernel/event.h>
-#include <kernel/thread.h>
-#include <platform.h>
-
-static enum handler_return timer_cb(struct timer* timer, lk_time_t now, void* arg)
-{
-    event_t* event = (event_t*)arg;
-    event_signal(event, false);
-
-    return INT_RESCHEDULE;
-}
-
-static int timer_do_one_thread(void* arg)
-{
-    event_t event;
-    timer_t timer;
-
-    event_init(&event, false, 0);
-    timer_init(&timer);
-
-    timer_set(&timer, current_time() + LK_MSEC(10), 0, timer_cb, &event);
-    event_wait(&event);
-
-    printf("got timer on cpu %u\n", arch_curr_cpu_num());
-
-    event_destroy(&event);
-
-    return 0;
-}
-
-static void timer_test_all_cpus(void)
-{
-    thread_t *timer_threads[SMP_MAX_CPUS];
-    uint max = arch_max_num_cpus();
-
-    uint i;
-    for (i = 0; i < max; i++) {
-        char name[16];
-        snprintf(name, sizeof(name), "timer %u\n", i);
-
-        timer_threads[i] = thread_create_etc(
-                NULL, name, timer_do_one_thread, NULL,
-                DEFAULT_PRIORITY, NULL, NULL, DEFAULT_STACK_SIZE, NULL);
-        if (timer_threads[i] == NULL) {
-            printf("failed to create thread for cpu %d\n", i);
-            return;
-        }
-        thread_set_pinned_cpu(timer_threads[i], i);
-        thread_resume(timer_threads[i]);
-    }
-    uint joined = 0;
-    for (i = 0; i < max; i++) {
-        if (thread_join(timer_threads[i], NULL, LK_SEC(1)) == 0) {
-            joined += 1;
-        }
-    }
-    printf("%u threads created, %u threads joined\n", max, joined);
-}
-
-static int cb2_timer_count = 0;
-
-static enum handler_return timer_cb2(struct timer* timer, lk_time_t now, void* arg)
-{
-    atomic_add(&cb2_timer_count, 1);
-    return INT_RESCHEDULE;
-}
-
-static void timer_test_coalescing(void)
-{
-    lk_time_t when = current_time() + LK_MSEC(1);
-    lk_time_t off = LK_USEC(10);
-    lk_time_t slack = 2u * off;
-
-    const lk_time_t deadline[] = {
-        when + (6u * off),          // non-coalesced, adjustment = 0
-        when,                       // non-coalesced, adjustment = 0
-        when - off,                 // coalesced with [1], adjustment = 10u
-        when - (3u * off),          // non-coalesced, adjustment = 0
-        when + off,                 // coalesced with [1], adjustment = -10u
-        when + (3u * off),          // non-coalesced, adjustment = 0
-        when + (5u * off),          // coalesced with [0], adjustment = 10u
-        when - (3u * off),          // non-coalesced, same as [3], adjustment = 0
-    } ;
-
-    const int64_t expected_adj[] = { 0, 0, LK_USEC(10), 0, -LK_USEC(10), 0, LK_USEC(10), 0 };
-
-    timer_t timer[countof(deadline)];
-
-    printf("       orig         new       adjustment\n");
-    for (int ix = 0; ix != countof(deadline); ++ix) {
-        timer_init(&timer[ix]);
-        lk_time_t dl = deadline[ix];
-        timer_set(&timer[ix], dl, slack, timer_cb2, NULL);
-        printf("[%d] %" PRIu64 "  -> %" PRIu64 ", %" PRIi64 "\n",
-            ix, dl, timer[ix].scheduled_time, timer[ix].slack);
-
-        if (timer[ix].slack != expected_adj[ix]) {
-            printf("unexpected adjustment! expected %" PRIi64 "\n", expected_adj[ix]);
-        }
-    }
-
-    // Wait for the timers to fire.
-    while(atomic_load(&cb2_timer_count) != countof(timer)) {
-        thread_sleep(when + LK_MSEC(5));
-    }
-
-    atomic_store(&cb2_timer_count, 0u);
-}
-
-void timer_tests(void)
-{
-    timer_test_coalescing();
-    timer_test_all_cpus();
-}
@@ -0,0 +1,197 @@
+// Copyright 2017 The Fuchsia Authors
+//
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file or at
+// https://opensource.org/licenses/MIT
+
+#include "tests.h"
+
+#include <err.h>
+#include <inttypes.h>
+#include <malloc.h>
+#include <platform.h>
+#include <stdio.h>
+
+#include <kernel/event.h>
+#include <kernel/thread.h>
+#include <kernel/timer.h>
+
+// Timer callback shared by the single-timer tests. |arg| is the event_t that
+// was handed to timer_set(); it is signalled so the waiting thread wakes up.
+// Always returns INT_RESCHEDULE.
+static enum handler_return timer_cb(struct timer* timer, lk_time_t now, void* arg) {
+    event_signal((event_t*)arg, false);
+    return INT_RESCHEDULE;
+}
+
+// Thread body for timer_test_all_cpus(): arms a timer 10ms from now with
+// timer_cb as its callback, blocks on a local event until that callback
+// signals it, then prints the cpu number the thread is running on.
+// |arg| is unused. Always returns 0.
+static int timer_do_one_thread(void* arg) {
+    timer_t t;
+    event_t fired;
+
+    timer_init(&t);
+    event_init(&fired, false, 0);
+
+    const lk_time_t deadline = current_time() + LK_MSEC(10);
+    timer_set(&t, deadline, TIMER_SLACK_CENTER, 0, timer_cb, &fired);
+    event_wait(&fired);
+
+    printf("got timer on cpu %u\n", arch_curr_cpu_num());
+
+    event_destroy(&fired);
+    return 0;
+}
+
+// Creates one thread per cpu, pins each to its cpu and lets it wait on a
+// 10ms timer (timer_do_one_thread). Every thread that was started is then
+// joined, allowing up to one second per join, and the number of threads
+// created and joined is printed.
+static void timer_test_all_cpus(void) {
+    thread_t* timer_threads[SMP_MAX_CPUS];
+    uint max = arch_max_num_cpus();
+
+    // timer_threads[] only has SMP_MAX_CPUS slots; never index past it.
+    if (max > SMP_MAX_CPUS) {
+        max = SMP_MAX_CPUS;
+    }
+
+    uint i;
+    for (i = 0; i < max; i++) {
+        char name[16];
+        // No trailing newline: this is a thread name, not a log line.
+        snprintf(name, sizeof(name), "timer %u", i);
+
+        timer_threads[i] = thread_create_etc(
+            NULL, name, timer_do_one_thread, NULL,
+            DEFAULT_PRIORITY, NULL, NULL, DEFAULT_STACK_SIZE, NULL);
+        if (timer_threads[i] == NULL) {
+            printf("failed to create thread for cpu %u\n", i);
+            // Stop creating threads, but fall through to the join loop so
+            // the threads already started are still joined.
+            max = i;
+            break;
+        }
+        thread_set_pinned_cpu(timer_threads[i], i);
+        thread_resume(timer_threads[i]);
+    }
+    uint joined = 0;
+    for (i = 0; i < max; i++) {
+        if (thread_join(timer_threads[i], NULL, LK_SEC(1)) == 0) {
+            joined += 1;
+        }
+    }
+    printf("%u threads created, %u threads joined\n", max, joined);
+}
+
+// Timer callback used by the coalescing tests. |arg| points at the int
+// counter passed to timer_set(); it is atomically incremented by one each
+// time a timer fires. Always returns INT_RESCHEDULE.
+static enum handler_return timer_cb2(struct timer* timer, lk_time_t now, void* arg) {
+    atomic_add((int*)arg, 1);
+    return INT_RESCHEDULE;
+}
+
+// Arms |count| timers with the given slack |mode| and |slack| amount, one per
+// entry of |deadline|, and prints the requested deadline, the scheduled time
+// and the adjustment recorded in each timer. An adjustment that differs from
+// the matching |expected_adj| entry is reported. The function then polls until
+// every timer has fired before releasing the timer storage, because the
+// callbacks write to a counter that lives on this stack frame.
+static void timer_test_coalescing(enum slack_mode mode, uint64_t slack,
+                                  const lk_time_t* deadline, const int64_t* expected_adj, int count) {
+    printf("testing coalescing mode %d\n", mode);
+
+    // Nothing to arm; also keeps the != loop conditions below from running
+    // away on a negative count.
+    if (count <= 0) {
+        return;
+    }
+
+    int timer_count = 0;
+
+    timer_t* timer = (timer_t*)malloc(sizeof(timer_t) * (size_t)count);
+    if (timer == NULL) {
+        printf("failed to allocate %d timers\n", count);
+        return;
+    }
+
+    printf("       orig         new       adjustment\n");
+    for (int ix = 0; ix != count; ++ix) {
+        timer_init(&timer[ix]);
+        lk_time_t dl = deadline[ix];
+        timer_set(&timer[ix], dl, mode, slack, timer_cb2, &timer_count);
+        printf("[%d] %" PRIu64 "  -> %" PRIu64 ", %" PRIi64 "\n",
+               ix, dl, timer[ix].scheduled_time, timer[ix].slack);
+
+        if (timer[ix].slack != expected_adj[ix]) {
+            printf("\n!! unexpected adjustment! expected %" PRIi64 "\n", expected_adj[ix]);
+        }
+    }
+
+    // Wait for the timers to fire.
+    while (atomic_load(&timer_count) != count) {
+        thread_sleep(current_time() + LK_MSEC(5));
+    }
+
+    free(timer);
+}
+
+// TIMER_SLACK_CENTER case. Deadlines are laid out around a base time 1ms in
+// the future in steps of LK_USEC(10), with a slack of two steps. The comment
+// on each entry states the adjustment that timer is expected to receive.
+static void timer_test_coalescing_center(void) {
+    const lk_time_t base = current_time() + LK_MSEC(1);
+    const lk_time_t step = LK_USEC(10);
+    const lk_time_t window = 2u * step;
+
+    const lk_time_t deadlines[] = {
+        base + (6u * step), // non-coalesced, adjustment = 0
+        base,               // non-coalesced, adjustment = 0
+        base - step,        // coalesced with [1], adjustment = 10u
+        base - (3u * step), // non-coalesced, adjustment = 0
+        base + step,        // coalesced with [1], adjustment = -10u
+        base + (3u * step), // non-coalesced, adjustment = 0
+        base + (5u * step), // coalesced with [0], adjustment = 10u
+        base - (3u * step), // non-coalesced, same as [3], adjustment = 0
+    };
+
+    const int64_t want[countof(deadlines)] = {
+        0, 0, LK_USEC(10), 0, -(int64_t)LK_USEC(10), 0, LK_USEC(10), 0};
+
+    timer_test_coalescing(
+        TIMER_SLACK_CENTER, window, deadlines, want, countof(deadlines));
+}
+
+// TIMER_SLACK_LATE case. Deadlines are laid out around a base time 1ms in
+// the future in steps of LK_USEC(10), with a slack of three steps. The
+// comment on each entry states the adjustment that timer is expected to
+// receive.
+static void timer_test_coalescing_late(void) {
+    const lk_time_t base = current_time() + LK_MSEC(1);
+    const lk_time_t step = LK_USEC(10);
+    const lk_time_t window = 3u * step;
+
+    const lk_time_t deadlines[] = {
+        base + step,        // non-coalesced, adjustment = 0
+        base + (2u * step), // non-coalesced, adjustment = 0
+        base - step,        // coalesced with [0], adjustment = 20u
+        base - (3u * step), // non-coalesced, adjustment = 0
+        base + (3u * step), // non-coalesced, adjustment = 0
+        base + (2u * step), // non-coalesced, same as [1]
+        base - (4u * step), // coalesced with [3], adjustment = 10u
+    };
+
+    const int64_t want[countof(deadlines)] = {
+        0, 0, LK_USEC(20), 0, 0, 0, LK_USEC(10)};
+
+    timer_test_coalescing(
+        TIMER_SLACK_LATE, window, deadlines, want, countof(deadlines));
+}
+
+// TIMER_SLACK_EARLY case. Deadlines are laid out around a base time 1ms in
+// the future in steps of LK_USEC(10), with a slack of three steps. The
+// comment on each entry states the adjustment that timer is expected to
+// receive.
+static void timer_test_coalescing_early(void) {
+    const lk_time_t base = current_time() + LK_MSEC(1);
+    const lk_time_t step = LK_USEC(10);
+    const lk_time_t window = 3u * step;
+
+    const lk_time_t deadlines[] = {
+        base,               // non-coalesced, adjustment = 0
+        base + (2u * step), // coalesced with [0], adjustment = -20u
+        base - step,        // non-coalesced, adjustment = 0
+        base - (3u * step), // non-coalesced, adjustment = 0
+        base + (4u * step), // non-coalesced, adjustment = 0
+        base + (5u * step), // coalesced with [4], adjustment = -10u
+        base - (2u * step), // coalesced with [3], adjustment = -10u
+    };
+
+    const int64_t want[countof(deadlines)] = {
+        0, -(int64_t)LK_USEC(20), 0, 0, 0, -(int64_t)LK_USEC(10), -(int64_t)LK_USEC(10)};
+
+    timer_test_coalescing(
+        TIMER_SLACK_EARLY, window, deadlines, want, countof(deadlines));
+}
+
+// Arms a timer with a deadline of UINT64_MAX - 5 and waits 100ms on the
+// event its callback would signal. The wait is expected to time out; any
+// other status is reported as the timer having fired unexpectedly.
+static void timer_far_deadline(void) {
+    event_t event;
+    timer_t timer;
+
+    event_init(&event, false, 0);
+    timer_init(&timer);
+
+    timer_set(&timer, UINT64_MAX - 5, TIMER_SLACK_CENTER, 0, timer_cb, &event);
+    status_t st = event_wait_deadline(&event, current_time() + LK_MSEC(100), false);
+    if (st != MX_ERR_TIMED_OUT) {
+        printf("error: unexpected timer fired!\n");
+    }
+
+    // Disarm unconditionally: |timer| and |event| live on this stack frame,
+    // so the timer must not remain queued after we return, whatever status
+    // the wait produced.
+    timer_cancel(&timer);
+
+    event_destroy(&event);
+}
+
+// Entry point for the timer tests: runs the three coalescing cases, then the
+// per-cpu timer test, then the far-deadline test, in that order.
+void timer_tests(void) {
+    typedef void (*timer_test_fn)(void);
+    static const timer_test_fn kSteps[] = {
+        timer_test_coalescing_center,
+        timer_test_coalescing_late,
+        timer_test_coalescing_early,
+        timer_test_all_cpus,
+        timer_far_deadline,
+    };
+
+    for (size_t ix = 0; ix != countof(kSteps); ++ix) {
+        kSteps[ix]();
+    }
+}
@@ -185,13 +185,7 @@ static void arm64_cpu_early_init(void)
    ASSERT( (mmfr0 & ARM64_MMFR0_ASIDBITS_MASK) == ARM64_MMFR0_ASIDBITS_16);

    /* set the vector base */
-    ARM64_WRITE_SYSREG(VBAR_EL1, (uint64_t)&arm64_exception_base);
-
-    /* switch to EL1 */
-    uint64_t current_el = ARM64_READ_SYSREG(CURRENTEL) >> 2;
-    if (current_el > 1) {
-        arm64_el3_to_el1();
-    }
+    ARM64_WRITE_SYSREG(VBAR_EL1, (uint64_t)&arm64_el1_exception_base);

    /* set some control bits in sctlr */
    uint64_t sctlr = ARM64_READ_SYSREG(sctlr_el1);
@@ -282,9 +276,9 @@ void arch_init(void)
    print_cpu_info();

    uint32_t max_cpus = arch_max_num_cpus();
-    uint32_t cmdline_max_cpus = cmdline_get_uint32("smp.maxcpus", max_cpus);
+    uint32_t cmdline_max_cpus = cmdline_get_uint32("kernel.smp.maxcpus", max_cpus);
    if (cmdline_max_cpus > max_cpus || cmdline_max_cpus <= 0) {
-        printf("invalid smp.maxcpus value, defaulting to %u\n", max_cpus);
+        printf("invalid kernel.smp.maxcpus value, defaulting to %u\n", max_cpus);
        cmdline_max_cpus = max_cpus;
    }

@@ -8,9 +8,14 @@
 #include <asm.h>
 #include <arch/asm_macros.h>

+#define HCR_EL2_RW  (1 << 31)
+#define SCR_EL3_HCE (1 << 1)
+#define SCR_EL3_RW  (1 << 10)
+
 /* void arm64_context_switch(vaddr_t *old_sp, vaddr_t new_sp); */
 FUNCTION(arm64_context_switch)
    /* save old frame */
+    /* This layout should match struct context_switch_frame */
    push_regs x29, x30
    push_regs x27, x28
    push_regs x25, x26
@@ -42,38 +47,6 @@ FUNCTION(arm64_context_switch)
    ret
 END_FUNCTION(arm64_context_switch)

-FUNCTION(arm64_el3_to_el1)
-    /* set EL2 to 64bit */
-    mrs x0, scr_el3
-    orr x0, x0, #(1<<10)
-    msr scr_el3, x0
-
-    /* set EL1 to 64bit */
-    mov x0, #(1<<31)
-    msr hcr_el2, x0
-
-    /* disable EL2 coprocessor traps */
-    mov x0, #0x33ff
-    msr cptr_el2, x0
-
-    /* disable EL1 FPU traps */
-    mov x0, #(0b11<<20)
-    msr cpacr_el1, x0
-
-    /* set up the EL1 bounce interrupt */
-    mov x0, sp
-    msr sp_el1, x0
-
-    adr x0, .Ltarget
-    msr elr_el3, x0
-
-    mov x0, #((0b1111 << 6) | (0b0101)) /* EL1h runlevel */
-    msr spsr_el3, x0
-    isb
-
-    eret
-END_FUNCTION(arm64_el3_to_el1)
-
 FUNCTION(arm64_elX_to_el1)
    mrs x9, CurrentEL

@@ -86,13 +59,12 @@ FUNCTION(arm64_elX_to_el1)
    cmp x9, #(0b10 << 2)
    beq .inEL2

-
-    /* set EL2 to 64bit */
+    /* set EL2 to 64bit and enable HVC instruction */
    mrs x9, scr_el3
-    orr x9, x9, #(1<<10)
+    orr x9, x9, #SCR_EL3_HCE
+    orr x9, x9, #SCR_EL3_RW
    msr scr_el3, x9

-
    adr x9, .Ltarget
    msr elr_el3, x9

@@ -101,20 +73,33 @@ FUNCTION(arm64_elX_to_el1)
    b   .confEL1

 .inEL2:
+    /* Set the vector base for EL2 */
+    adr_global x9, arm64_el2_exception_base
+    msr vbar_el2, x9
+
+    /* Ensure EL1 timers are properly configured, disable EL2 trapping of
+        EL1 access to timer control registers.  Also clear virtual offset.
+    */
+    mrs x9, cnthctl_el2
+    orr x9, x9, #3
+    msr cnthctl_el2, x9
+    msr cntvoff_el2, xzr
+
+    /* clear out stage 2 translations */
+    msr vttbr_el2, xzr
+
    adr x9, .Ltarget
    msr elr_el2, x9
    mov x9, #((0b1111 << 6) | (0b0101)) /* EL1h runlevel */
    msr spsr_el2, x9

-
-
 .confEL1:
    /* disable EL2 coprocessor traps */
    mov x9, #0x33ff
    msr cptr_el2, x9

    /* set EL1 to 64bit */
-    mov x9, #(1<<31)
+    mov x9, #HCR_EL2_RW
    msr hcr_el2, x9

    /* disable EL1 FPU traps */
@@ -154,5 +139,4 @@ FUNCTION(arm64_get_secondary_sp)
    ldr     x0, [x11, #8]
    add     x1, x11, #32
    ret
-
 END_FUNCTION(arm64_get_secondary_sp)
@@ -12,6 +12,10 @@
 #include <kernel/thread.h>
 #include <magenta/syscalls/debug.h>

+// Only the NZCV flags (bits 31 to 28 respectively) of the CPSR are
+// readable and writable by userland on ARM64.
+static uint32_t kUserVisibleFlags = 0xf0000000;
+
 uint arch_num_regsets(void)
 {
    return 1; // TODO(dje): Just the general regs for now.
@@ -45,7 +49,7 @@ static status_t arch_get_general_regs(struct thread *thread, mx_arm64_general_re
    out->lr = in->lr;
    out->sp = in->usp;
    out->pc = in->elr;
-    out->cpsr = in->spsr;
+    out->cpsr = in->spsr & kUserVisibleFlags;

    return MX_OK;
 }
@@ -75,7 +79,8 @@ static status_t arch_set_general_regs(struct thread *thread, const mx_arm64_gene
    out->lr = in->lr;
    out->usp = in->sp;
    out->elr = in->pc;
-    out->spsr = in->cpsr;
+    out->spsr = (out->spsr & ~kUserVisibleFlags)
+        | (in->cpsr & kUserVisibleFlags);

    return MX_OK;
 }
@@ -95,7 +100,7 @@ status_t arch_get_regset(struct thread *thread, uint regset, void *regs, uint32_

 // The caller is responsible for making sure the thread is in an exception
 // or is suspended, and stays so.
-status_t arch_set_regset(struct thread *thread, uint regset, const void *regs, uint32_t buf_size, bool priv)
+status_t arch_set_regset(struct thread *thread, uint regset, const void *regs, uint32_t buf_size)
 {
    switch (regset)
    {
@@ -0,0 +1,187 @@
+// Copyright 2017 The Fuchsia Authors
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file or at
+// https://opensource.org/licenses/MIT
+
+#include <stdlib.h>
+#include <arch/ops.h>
+#include <arch/efi.h>
+#include <kernel/vm.h>
+#include <inttypes.h>
+#include <string.h>
+
+efi_system_table_t *sys_table = NULL;
+
+uint64_t efi_boot(void* handle, efi_system_table_t *systable, paddr_t image_addr) __EXTERNALLY_VISIBLE;
+
+// Returns the number of bytes efi_utf16_to_ascii() writes when converting
+// the first |n| UTF-16 code units at |src|: one byte for units below 0x80,
+// two for units below 0x800 and three for everything else. Callers size
+// their destination buffer from this value, so it has to count every byte
+// the converter emits, not only the 7-bit characters.
+static uint32_t efi_utf16_ascii_len(const uint16_t *src, int n) {
+    uint32_t count = 0;
+    uint16_t c;
+    while (n-- > 0) {
+        c = *src++;
+        if (c < 0x80)
+            count += 1;
+        else if (c < 0x800)
+            count += 2;
+        else
+            count += 3;
+    }
+    return count;
+}
+
+// Converts |n| UTF-16 code units from |src| into bytes at |dst|. Units below
+// 0x80 are copied as a single byte; larger values are written as a 2-, 3- or
+// 4-byte sequence with a lead byte of 0xc0/0xe0/0xf0 followed by 0x80-based
+// continuation bytes. No terminator is written. Returns the position just
+// past the last byte written.
+static char *efi_utf16_to_ascii(char *dst, const uint16_t *src, int n)
+{
+    for (; n != 0; --n) {
+        const uint32_t unit = *src++;
+
+        if (unit < 0x80) {
+            *dst++ = (char)unit;
+        } else if (unit < 0x800) {
+            *dst++ = (char)(0xc0 + (unit >> 6));
+            *dst++ = (char)(0x80 + (unit & 0x3f));
+        } else if (unit < 0x10000) {
+            *dst++ = (char)(0xe0 + (unit >> 12));
+            *dst++ = (char)(0x80 + ((unit >> 6) & 0x3f));
+            *dst++ = (char)(0x80 + (unit & 0x3f));
+        } else {
+            *dst++ = (char)(0xf0 + (unit >> 18));
+            *dst++ = (char)(0x80 + ((unit >> 12) & 0x3f));
+            *dst++ = (char)(0x80 + ((unit >> 6) & 0x3f));
+            *dst++ = (char)(0x80 + (unit & 0x3f));
+        }
+    }
+
+    return dst;
+}
+
+// Writes |str| to the EFI console one character at a time through the
+// con_out protocol of the saved system table, emitting a '\r' before every
+// '\n'. Does nothing if the system table has not been saved yet.
+static void efi_print(const char *str)
+{
+    if (!sys_table)
+        return;
+
+    struct efi_simple_text_output_protocol *con =
+        (struct efi_simple_text_output_protocol *)sys_table->con_out;
+
+    for (const char *p = str; *p; p++) {
+        if (*p == '\n') {
+            efi_char16_t cr[2] = { '\r', 0 };
+            con->output_string(con, cr);
+        }
+
+        // One character followed by the terminator.
+        efi_char16_t one[2] = { 0 };
+        one[0] = *p;
+        con->output_string(con, one);
+    }
+}
+// printf-style logging to the EFI console: formats into a 256-byte stack
+// buffer (longer output is truncated by snprintf) and hands it to
+// efi_print(). The macro deliberately has no trailing semicolon so that
+// "efi_printf(...);" is a single statement and stays safe in an unbraced
+// if/else. Flip the #if to 0 to compile the logging out.
+#if 1
+#define efi_printf(args...)                 \
+    do {                                    \
+        char buff[256];                     \
+        snprintf(buff,sizeof(buff),args);   \
+        efi_print(buff);                    \
+    } while(0)
+#else
+#define efi_printf(args...)
+#endif
+
+extern uint64_t _start;
+extern uint64_t _end;
+
+// Called when the kernel image is started by an EFI loader.
+//
+// Steps, in order:
+//   1. Save |systable| so efi_printf() can reach the console.
+//   2. Look up the loaded-image protocol for |handle| to get the load options.
+//   3. Allocate pages at MEMBASE + KERNEL_LOAD_OFFSET and copy the image
+//      from |image_addr| there.
+//   4. Allocate an efi_magenta_hdr_t plus room for the command line and fill
+//      it with the load options converted from UTF-16.
+//   5. Parse "initrd=<hex address><separator><hex size>" from the command
+//      line, copy that ramdisk to a fixed address and record it in the header.
+//   6. Sync the caches over everything that was written.
+//
+// Returns the address of the header, or 0 if any step fails (including when
+// no usable initrd= argument is present).
+uint64_t efi_boot(void* handle, efi_system_table_t *systable, paddr_t image_addr) {
+
+    efi_status_t status;
+    efi_loaded_image_t *image;
+    efi_guid_t loaded_image_proto = LOADED_IMAGE_PROTOCOL_GUID;
+
+    sys_table = systable;
+
+    efi_printf("Booting Magenta from EFI loader...\n");
+
+    status = systable->boottime->handle_protocol(handle,
+                    &loaded_image_proto, (void **)&image);
+    if (status != EFI_SUCCESS) {
+        efi_printf("Failed to get loaded image protocol\n");
+        return 0;
+    }
+
+    // Allocate space for new kernel location (+bss)
+    uint64_t kern_pages = (uint64_t)&_end - (uint64_t)&_start;
+    kern_pages = ROUNDUP(kern_pages, EFI_ALLOC_ALIGN) / EFI_PAGE_SIZE;
+    efi_physical_addr_t target_addr = MEMBASE + KERNEL_LOAD_OFFSET;
+    status = systable->boottime->allocate_pages( EFI_ALLOCATE_ADDRESS,
+                                                 EFI_LOADER_DATA,
+                                                 kern_pages,
+                                                 &target_addr);
+    if (status != EFI_SUCCESS) {
+        efi_printf("Failed to allocate space for kernel\n");
+        return 0;
+    }
+
+    // Copy kernel to new location
+    memcpy((void*)target_addr,(void*)image_addr,kern_pages*EFI_PAGE_SIZE);
+
+
+    efi_magenta_hdr_t *mag_hdr;
+
+    // +1 leaves room for the terminating NUL written below.
+    uint32_t cmd_line_len = efi_utf16_ascii_len((const uint16_t*)image->load_options,image->load_options_size/2) + 1;
+
+    status = systable->boottime->allocate_pool(EFI_LOADER_DATA, sizeof(*mag_hdr) + cmd_line_len,
+                                                                (void **)&mag_hdr);
+    if (status != EFI_SUCCESS) {
+        efi_printf("Failed to allocate space for magenta boot args\n");
+        return 0;
+    }
+
+    efi_printf("Magenta boot args address= %p\n",(void*)mag_hdr);
+
+    mag_hdr->magic = EFI_MAGENTA_MAGIC;
+    mag_hdr->cmd_line_len = cmd_line_len;
+    efi_utf16_to_ascii(mag_hdr->cmd_line, (const uint16_t*)image->load_options, image->load_options_size/2);
+    mag_hdr->cmd_line[cmd_line_len-1]=0;
+
+    efi_printf("Magenta cmdline args = %s\n",mag_hdr->cmd_line);
+    const char token[] = "initrd=";
+    char* pos;
+    uint64_t initrd_start_phys=0;
+    uint64_t initrd_size=0;
+    pos = strstr(mag_hdr->cmd_line,token);
+    if (pos) {
+        pos = pos + strlen(token);
+        initrd_start_phys = strtoll(pos,&pos,16);
+        // Step over the separator only if there is one. If the address is
+        // the last thing on the command line, |pos| is at the terminating
+        // NUL and advancing would read past the end of the buffer; the size
+        // then stays 0 and the "not found" path below is taken.
+        if (*pos != '\0') {
+            pos++;
+            initrd_size = strtoll(pos,&pos,16);
+        }
+    }
+
+    if (initrd_start_phys && initrd_size) {
+        // allocate_pages() is given a count of EFI pages, so convert the
+        // rounded byte size with EFI_PAGE_SIZE (as the kernel allocation
+        // above does). NOTE(review): identical to the previous PAGE_SIZE
+        // division whenever PAGE_SIZE == EFI_PAGE_SIZE - confirm for targets
+        // where they differ.
+        uint64_t ramdisk_pages = ROUNDUP_PAGE_SIZE(initrd_size) / EFI_PAGE_SIZE;
+        /* TODO - figure out how to pull this from boot image header */
+        efi_physical_addr_t ramdisk_target_addr = 0x07c00000;
+
+        status = systable->boottime->allocate_pages( EFI_ALLOCATE_ADDRESS,
+                                                     EFI_LOADER_DATA,
+                                                     ramdisk_pages,
+                                                     &ramdisk_target_addr);
+        if (status != EFI_SUCCESS) {
+            efi_printf("Failed to allocate space for ramdisk\n");
+            return 0;
+        }
+        mag_hdr->ramdisk_base_phys = (uint64_t)ramdisk_target_addr;
+        mag_hdr->ramdisk_size = (uint64_t)ROUNDUP_PAGE_SIZE(initrd_size);
+
+        // Copy ramdisk to new location
+        memcpy((void*)ramdisk_target_addr,
+               (void*)initrd_start_phys,initrd_size);
+
+        arch_sync_cache_range((addr_t)ramdisk_target_addr,initrd_size);
+        efi_printf("initrd found and flushed from cache...\n");
+    } else {
+        efi_printf("initrd not found!!!!!\n");
+        return 0;
+    }
+
+    // sync cache (we jumped here with mmu on w/ identity and cache on)
+    arch_sync_cache_range((addr_t)target_addr, kern_pages*EFI_PAGE_SIZE);
+    arch_sync_cache_range((addr_t)mag_hdr, sizeof(*mag_hdr) + cmd_line_len);
+
+    return (uint64_t)mag_hdr;
+}
@@ -11,7 +11,7 @@
 #include <arch/arm64.h>
 #include <arch/arm64/exceptions.h>

-.section .text.boot.vectab,"ax",@progbits
+.section .text.boot.vectab.el1,"ax",@progbits
 .align 12

 #define DW_REG_lr   30
@@ -148,11 +148,7 @@ mark_lr_sp_inaccessible
    .cfi_startproc simple
    .cfi_signal_frame
    // The return address is in elr_el1, not lr.
-#ifndef __clang__
-    // TODO(dje): Add clang back when the upstream LLVM bug
-    // https://bugs.llvm.org/show_bug.cgi?id=33953 is fixed.
    .cfi_return_column elr1
-#endif
    .cfi_def_cfa sp, 0
 .endm

@@ -252,102 +248,103 @@ mark_lr_sp_inaccessible
    b  arm64_exc_shared_restore_long
 .endm

-FUNCTION_LABEL(arm64_exception_base)
+FUNCTION_LABEL(arm64_el1_exception_base)

 /* exceptions from current EL, using SP0 */
-LOCAL_FUNCTION_LABEL(arm64_sync_exc_current_el_SP0)
+.org 0x000
+LOCAL_FUNCTION_LABEL(arm64_el1_sync_exc_current_el_SP0)
    invalid_exception 0
-END_FUNCTION(arm64_sync_exc_current_el_SP0)
+END_FUNCTION(arm64_el1_sync_exc_current_el_SP0)

 .org 0x080
-LOCAL_FUNCTION_LABEL(arm64_irq_current_el_SP0)
+LOCAL_FUNCTION_LABEL(arm64_el1_irq_current_el_SP0)
    invalid_exception 1
-END_FUNCTION(arm64_irq_current_el_SP0)
+END_FUNCTION(arm64_el1_irq_current_el_SP0)

 .org 0x100
-LOCAL_FUNCTION_LABEL(arm64_fiq_current_el_SP0)
+LOCAL_FUNCTION_LABEL(arm64_el1_fiq_current_el_SP0)
    invalid_exception 2
-END_FUNCTION(arm64_fiq_current_el_SP0)
+END_FUNCTION(arm64_el1_fiq_current_el_SP0)

 .org 0x180
-LOCAL_FUNCTION_LABEL(arm64_err_exc_current_el_SP0)
+LOCAL_FUNCTION_LABEL(arm64_el1_err_exc_current_el_SP0)
    invalid_exception 3
-END_FUNCTION(arm64_err_exc_current_el_SP0)
+END_FUNCTION(arm64_el1_err_exc_current_el_SP0)

 /* exceptions from current EL, using SPx */
 .org 0x200
-LOCAL_FUNCTION_LABEL(arm64_sync_exc_current_el_SPx)
+LOCAL_FUNCTION_LABEL(arm64_el1_sync_exc_current_el_SPx)
    sync_exception #0 /* same EL, arm64 */
-END_FUNCTION(arm64_sync_exc_current_el_SPx)
+END_FUNCTION(arm64_el1_sync_exc_current_el_SPx)

 .org 0x280
-LOCAL_FUNCTION_LABEL(arm64_irq_current_el_SPx)
+LOCAL_FUNCTION_LABEL(arm64_el1_irq_current_el_SPx)
    irq_exception #0 /* same EL, arm64 */
-END_FUNCTION(arm64_irq_current_el_SPx)
+END_FUNCTION(arm64_el1_irq_current_el_SPx)

 .org 0x300
-LOCAL_FUNCTION_LABEL(arm64_fiq_current_el_SPx)
+LOCAL_FUNCTION_LABEL(arm64_el1_fiq_current_el_SPx)
    start_isr_func
    regsave_short
    mov x0, sp
    bl  platform_fiq
    b  arm64_exc_shared_restore_short
-END_FUNCTION(arm64_fiq_current_el_SPx)
+END_FUNCTION(arm64_el1_fiq_current_el_SPx)

 .org 0x380
-LOCAL_FUNCTION_LABEL(arm64_err_exc_current_el_SPx)
+LOCAL_FUNCTION_LABEL(arm64_el1_err_exc_current_el_SPx)
    invalid_exception 0x13
-END_FUNCTION(arm64_err_exc_current_el_SPx)
+END_FUNCTION(arm64_el1_err_exc_current_el_SPx)

 /* exceptions from lower EL, running arm64 */
 .org 0x400
-LOCAL_FUNCTION_LABEL(arm64_sync_exc_lower_el_64)
+LOCAL_FUNCTION_LABEL(arm64_el1_sync_exc_lower_el_64)
    sync_exception #(ARM64_EXCEPTION_FLAG_LOWER_EL)
-END_FUNCTION(arm64_sync_exc_lower_el_64)
+END_FUNCTION(arm64_el1_sync_exc_lower_el_64)

 .org 0x480
-LOCAL_FUNCTION_LABEL(arm64_irq_lower_el_64)
+LOCAL_FUNCTION_LABEL(arm64_el1_irq_lower_el_64)
    irq_exception #(ARM64_EXCEPTION_FLAG_LOWER_EL)
-END_FUNCTION(arm64_irq_lower_el_64)
+END_FUNCTION(arm64_el1_irq_lower_el_64)

 .org 0x500
-LOCAL_FUNCTION_LABEL(arm64_fiq_lower_el_64)
+LOCAL_FUNCTION_LABEL(arm64_el1_fiq_lower_el_64)
    start_isr_func
    regsave_short
    mov x0, sp
    bl  platform_fiq
    b  arm64_exc_shared_restore_short
-END_FUNCTION(arm64_fiq_lower_el_64)
+END_FUNCTION(arm64_el1_fiq_lower_el_64)

 .org 0x580
-LOCAL_FUNCTION_LABEL(arm64_err_exc_lower_el_64)
+LOCAL_FUNCTION_LABEL(arm64_el1_err_exc_lower_el_64)
    invalid_exception 0x23
-END_FUNCTION(arm64_err_exc_lower_el_64)
+END_FUNCTION(arm64_el1_err_exc_lower_el_64)

 /* exceptions from lower EL, running arm32 */
 .org 0x600
-LOCAL_FUNCTION_LABEL(arm64_sync_exc_lower_el_32)
+LOCAL_FUNCTION_LABEL(arm64_el1_sync_exc_lower_el_32)
    sync_exception #(ARM64_EXCEPTION_FLAG_LOWER_EL|ARM64_EXCEPTION_FLAG_ARM32)
-END_FUNCTION(arm64_sync_exc_lower_el_32)
+END_FUNCTION(arm64_el1_sync_exc_lower_el_32)

 .org 0x680
-LOCAL_FUNCTION_LABEL(arm64_irq_lower_el_32)
+LOCAL_FUNCTION_LABEL(arm64_el1_irq_lower_el_32)
    irq_exception #(ARM64_EXCEPTION_FLAG_LOWER_EL|ARM64_EXCEPTION_FLAG_ARM32)
-END_FUNCTION(arm64_irq_lower_el_32)
+END_FUNCTION(arm64_el1_irq_lower_el_32)

 .org 0x700
-LOCAL_FUNCTION_LABEL(arm64_fiq_lower_el_32)
+LOCAL_FUNCTION_LABEL(arm64_el1_fiq_lower_el_32)
    start_isr_func
    regsave_short
    mov x0, sp
    bl  platform_fiq
    b  arm64_exc_shared_restore_short
-END_FUNCTION(arm64_fiq_lower_el_32)
+END_FUNCTION(arm64_el1_fiq_lower_el_32)

 .org 0x780
-LOCAL_FUNCTION_LABEL(arm64_err_exc_lower_el_32)
+LOCAL_FUNCTION_LABEL(arm64_el1_err_exc_lower_el_32)
    invalid_exception 0x33
-END_FUNCTION(arm64_err_exc_lower_el_32)
+END_FUNCTION(arm64_el1_err_exc_lower_el_32)

 /* If an IRQ happened in userspace, and either the thread was signaled or
   needs to be rescheduled, then we end up here after arm64_irq returns.
@@ -13,16 +13,15 @@
 #include <arch/arch_ops.h>
 #include <arch/arm64.h>
 #include <arch/arm64/exceptions.h>
+#include <arch/exception.h>
+#include <arch/user_copy.h>
 #include <kernel/thread.h>
 #include <kernel/stats.h>
 #include <kernel/vm.h>
-#include <kernel/vm/fault.h>
 #include <platform.h>
+#include <vm/fault.h>

-#if WITH_LIB_MAGENTA
-#include <lib/user_copy.h>
-#include <magenta/exception.h>
-#endif
+#include <magenta/syscalls/exception.h>

 #define LOCAL_TRACE 0

@@ -50,9 +49,9 @@ __WEAK void arm64_syscall(struct arm64_iframe_long *iframe, bool is_64bit, uint6
    panic("unhandled syscall vector\n");
 }

-#if WITH_LIB_MAGENTA
-
-static status_t call_magenta_data_fault_exception_handler(mx_excp_type_t type, struct arm64_iframe_long *iframe, uint32_t esr, uint64_t far)
+static status_t try_dispatch_user_data_fault_exception(
+    mx_excp_type_t type, struct arm64_iframe_long *iframe,
+    uint32_t esr, uint64_t far)
 {
    thread_t *thread = get_current_thread();
    arch_exception_context_t context = {};
@@ -64,19 +63,18 @@ static status_t call_magenta_data_fault_exception_handler(mx_excp_type_t type, s
    arch_enable_ints();
    DEBUG_ASSERT(thread->arch.suspended_general_regs == nullptr);
    thread->arch.suspended_general_regs = iframe;
-    status_t status = magenta_exception_handler(type, &context);
+    status_t status = dispatch_user_exception(type, &context);
    thread->arch.suspended_general_regs = nullptr;
    arch_disable_ints();
    return status;
 }

-static status_t call_magenta_exception_handler(mx_excp_type_t type, struct arm64_iframe_long *iframe, uint32_t esr)
+static status_t try_dispatch_user_exception(
+    mx_excp_type_t type, struct arm64_iframe_long *iframe, uint32_t esr)
 {
-    return call_magenta_data_fault_exception_handler(type, iframe, esr, 0);
+    return try_dispatch_user_data_fault_exception(type, iframe, esr, 0);
 }

-#endif
-
 __NO_RETURN static void exception_die(struct arm64_iframe_long *iframe, uint32_t esr)
 {
    platform_panic_start();
@@ -101,9 +99,7 @@ static void arm64_unknown_handler(struct arm64_iframe_long *iframe, uint excepti
        printf("unknown exception in kernel: PC at %#" PRIx64 "\n", iframe->elr);
        exception_die(iframe, esr);
    }
-#if WITH_LIB_MAGENTA
-    call_magenta_exception_handler (MX_EXCP_UNDEFINED_INSTRUCTION, iframe, esr);
-#endif
+    try_dispatch_user_exception(MX_EXCP_UNDEFINED_INSTRUCTION, iframe, esr);
 }

 static void arm64_brk_handler(struct arm64_iframe_long *iframe, uint exception_flags,
@@ -114,9 +110,7 @@ static void arm64_brk_handler(struct arm64_iframe_long *iframe, uint exception_f
        printf("BRK in kernel: PC at %#" PRIx64 "\n", iframe->elr);
        exception_die(iframe, esr);
    }
-#if WITH_LIB_MAGENTA
-    call_magenta_exception_handler (MX_EXCP_SW_BREAKPOINT, iframe, esr);
-#endif
+    try_dispatch_user_exception(MX_EXCP_SW_BREAKPOINT, iframe, esr);
 }

 static void arm64_fpu_handler(struct arm64_iframe_long *iframe, uint exception_flags,
@@ -172,14 +166,13 @@ static void arm64_instruction_abort_handler(struct arm64_iframe_long *iframe, ui
    if (err >= 0)
        return;

-#if WITH_LIB_MAGENTA
-    /* if this is from user space, let magenta get a shot at it */
+    // If this is from user space, let the user exception handler
+    // get a shot at it.
    if (is_user) {
        CPU_STATS_INC(exceptions);
-        if (call_magenta_data_fault_exception_handler (MX_EXCP_FATAL_PAGE_FAULT, iframe, esr, far) == MX_OK)
+        if (try_dispatch_user_data_fault_exception(MX_EXCP_FATAL_PAGE_FAULT, iframe, esr, far) == MX_OK)
            return;
    }
-#endif

    printf("instruction abort: PC at %#" PRIx64 ", is_user %d, FAR %" PRIx64 "\n",
           iframe->elr, is_user, far);
@@ -226,18 +219,17 @@ static void arm64_data_abort_handler(struct arm64_iframe_long *iframe, uint exce
        return;
    }

-#if WITH_LIB_MAGENTA
-    /* if this is from user space, let magenta get a shot at it */
+    // If this is from user space, let the user exception handler
+    // get a shot at it.
    if (is_user) {
        CPU_STATS_INC(exceptions);
        mx_excp_type_t excp_type = MX_EXCP_FATAL_PAGE_FAULT;
        if (unlikely(dfsc == DFSC_ALIGNMENT_FAULT)) {
            excp_type = MX_EXCP_UNALIGNED_ACCESS;
        }
-        if (call_magenta_data_fault_exception_handler (excp_type, iframe, esr, far) == MX_OK)
+        if (try_dispatch_user_data_fault_exception(excp_type, iframe, esr, far) == MX_OK)
            return;
    }
-#endif

    /* decode the iss */
    if (BIT(iss, 24)) { /* ISV bit */
@@ -302,11 +294,9 @@ extern "C" void arm64_sync_exception(struct arm64_iframe_long *iframe, uint exce
                printf("unhandled exception in kernel: PC at %#" PRIx64 "\n", iframe->elr);
                exception_die(iframe, esr);
            }
-#if WITH_LIB_MAGENTA
-            /* let magenta get a shot at it */
-            if (call_magenta_exception_handler (MX_EXCP_GENERAL, iframe, esr) == MX_OK)
+            /* let the user exception handler get a shot at it */
+            if (try_dispatch_user_exception(MX_EXCP_GENERAL, iframe, esr) == MX_OK)
                break;
-#endif
            printf("unhandled synchronous exception\n");
            exception_die(iframe, esr);
        }
@@ -406,7 +396,6 @@ static void arm64_thread_process_pending_signals(struct arm64_iframe_long *ifram
    thread->arch.suspended_general_regs = nullptr;
 }

-#if WITH_LIB_MAGENTA
 void arch_dump_exception_context(const arch_exception_context_t *context)
 {
    uint32_t ec = BITS_SHIFT(context->esr, 31, 26);
@@ -436,7 +425,7 @@ void arch_dump_exception_context(const arch_exception_context_t *context)
    // try to dump the user stack
    if (is_user_address(context->frame->usp)) {
        uint8_t buf[256];
-        if (copy_from_user_unsafe(buf, (void *)context->frame->usp, sizeof(buf)) == MX_OK) {
+        if (arch_copy_from_user(buf, (void *)context->frame->usp, sizeof(buf)) == MX_OK) {
            printf("bottom of user stack at 0x%lx:\n", (vaddr_t)context->frame->usp);
            hexdump_ex(buf, sizeof(buf), context->frame->usp);
        }
@@ -457,12 +446,10 @@ void arch_fill_in_exception_context(const arch_exception_context_t *arch_context
    }
 }

-status_t magenta_report_policy_exception(void)
+status_t arch_dispatch_user_policy_exception(void)
 {
    struct arm64_iframe_long frame = {};
    arch_exception_context_t context = {};
    context.frame = &frame;
-    return magenta_exception_handler(MX_EXCP_POLICY_ERROR, &context);
+    return dispatch_user_exception(MX_EXCP_POLICY_ERROR, &context);
 }
-
-#endif
@@ -18,6 +18,7 @@ _fastboot_start:
    .quad   0
    .quad   0
    .quad   0
+
    .byte   'A'
    .byte   'R'
    .byte   'M'
@@ -26,51 +27,51 @@ _fastboot_start:
    .align 3

 pe_header:
-    .ascii  "PE"
-    .short  0
+    .ascii  "PE"                            //PE Magic
+    .short  0                               //  PE Magic

    // coff header
-    .short  0xaa64
-    .short  2       // number of sections
-    .long   0
-    .long   0
-    .long   1
-    .short  section_table - optional_header
-    .short  0x206
+    .short  0xaa64                          //Machine
+    .short  2                               //NumberOfSections
+    .long   0                               //TimeDateStamp
+    .long   0                               //PointerToSymbolTable
+    .long   1                               //NumberOfSymbols
+    .short  section_table - optional_header //SizeOfOptionalHeader
+    .short  0x206                           //Characteristics

 optional_header:
-    .short  0x20B
-    .byte   0x2
-    .byte   0x14
-    .long   _end - header_end
-    .long   0
-    .long   0
-    .long   0   // should be entry point
-    .long   header_end - _start
+    .short  0x20B                           //signature
+    .byte   0x2                             //MajorLinkerVersion
+    .byte   0x14                            //MinorLinkerVersion
+    .long   _end - header_end               //sizeOfCode
+    .long   0                               //SizeOfInitializedData
+    .long   0                               //SizeofUninitializedData
+    .long   header_end - _start             //AddressOfEntryPoint
+    .long   header_end - _start             //BaseOfCode

-    .quad   0
-    .long   0x1000
-    .long   0x200
-    .short  0
-    .short  0
-    .short  0
-    .short  0
-    .short  0
-    .short  0
-    .long   0
+    .quad   0                               //ImageBase
+    .long   0x1000                          //SectionAlignment
+    .long   0x200                           //FileAlignment
+    .short  0                               //MajorOSVersion
+    .short  0                               //MinorOSVersion
+    .short  0                               //MajorImageVersion
+    .short  0                               //MinorImageVersion
+    .short  0                               //MajorSubsystemVersion
+    .short  0                               //MinorSubsystemVersion
+    .long   0                               //Win32VersionValue

-    .long   _end - _start
-    .long   header_end - _start
-    .long   0
-    .short  0xA
-    .short  0
-    .quad   0
-    .quad   0
-    .quad   0
-    .quad   0
-    .long   0
-    .long   6
-    .quad   0
+    .long   _end - _start                   //SizeOfImage
+    .long   header_end - _start             //SizeOfHeaders
+    .long   0                               //Checksum
+    .short  0xA                             //Subsystem
+    .short  0                               //DLLCharacteristics
+    .quad   0                               //SizeOfStackReserve
+    .quad   0                               //SizeOfStackCommit
+    .quad   0                               //SizeOfHeapReserve
+    .quad   0                               //SizeOfHeapCommit
+    .long   0                               //LoaderFlags
+    .long   6                               //NumberOfRvaAndSizes
+    .quad   0                               //DataDirectory[6]
    .quad   0
    .quad   0
    .quad   0
@@ -91,19 +92,19 @@ section_table:
    .short  0
    .long   0x42100040

-    .ascii  ".text"
+    .ascii  ".text"                         //Name[8] (char), zero-padded by the bytes that follow
    .byte   0
    .byte   0
    .byte   0
-    .long   _end - header_end
-    .long   header_end - _start
-    .long   0
-    .long   0
-    .long   0
-    .long   0
-    .short  0
-    .short  0
-    .long   0xE0500020
+    .long   0                               //PhysicalAddress/VirtualSize
+    .long   header_end - _start             //VirtualAddress
+    .long   __data_end - header_end         //SizeOfRawData
+    .long   header_end - _start             //PointerToRawData
+    .long   0                               //PointerToRelocations
+    .long   0                               //PointerToLinenumbers
+    .short  0                               //NumberOfRelocations
+    .short  0                               //NumberOfLinenumbers
+    .long   0xE0500020                      //Characteristics
    .align 12

 header_end:
@@ -1,33 +0,0 @@
-// Copyright 2017 The Fuchsia Authors
-//
-// Use of this source code is governed by a MIT-style
-// license that can be found in the LICENSE file or at
-// https://opensource.org/licenses/MIT
-
-#include <arch/hypervisor.h>
-#include <magenta/errors.h>
-
-status_t arch_guest_create(mxtl::RefPtr<VmObject> physmem, mxtl::unique_ptr<Guest>* guest) {
-    return MX_ERR_NOT_SUPPORTED;
-}
-
-status_t arch_guest_set_trap(Guest* guest, uint32_t kind, mx_vaddr_t addr, size_t len,
-                             mxtl::RefPtr<FifoDispatcher> fifo) {
-    return MX_ERR_NOT_SUPPORTED;
-}
-
-status_t arch_vcpu_resume(Vcpu* vcpu, mx_guest_packet_t* packet) {
-    return MX_ERR_NOT_SUPPORTED;
-}
-
-status_t arch_vcpu_interrupt(Vcpu* vcpu, uint32_t interrupt) {
-    return MX_ERR_NOT_SUPPORTED;
-}
-
-status_t arch_vcpu_read_state(const Vcpu* vcpu, uint32_t kind, void* buffer, uint32_t len) {
-    return MX_ERR_NOT_SUPPORTED;
-}
-
-status_t arch_vcpu_write_state(Vcpu* vcpu, uint32_t kind, const void* buffer, uint32_t len) {
-    return MX_ERR_NOT_SUPPORTED;
-}
@@ -0,0 +1,138 @@
+// Copyright 2017 The Fuchsia Authors
+//
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file or at
+// https://opensource.org/licenses/MIT
+
+#include <arch/arm64/mmu.h>
+#include <arch/asm_macros.h>
+#include <asm.h>
+#include <magenta/errors.h>
+
+#define ESR_EL2_EC_MASK     0xfc000000
+#define ESR_EL2_ISS_MASK    0x01ffffff
+#define HVC_MAX_INDEX       1
+
+.section .text.el2,"ax",@progbits
+.align 12
+
+// EL2 functions
+LOCAL_FUNCTION(el2_set_stack)               // target of slot 0 in the sync_exception dispatch table
+    mov sp, x0                              // stack pointer <- value passed in x0
+    mov x0, #MX_OK                          // status handed back in x0
+    eret                                    // return from the exception
+END_FUNCTION(el2_set_stack)
+
+.section .text.boot.vectab.el2,"ax",@progbits
+.align 12
+
+.macro invalid_exception                    // body for every vector that has no real handler
+    // TODO(abdulla): Check VMID from VTTBR_EL2. ERET to host with error. If
+    // VMID was not 0, terminate guest.
+    eret                                    // for now: return from the exception without touching any state
+.endm
+
+.macro sync_exception                       // dispatch on the ISS field of ESR_EL2; overwrites x9 and x10
+    mrs x10, esr_el2                        // NOTE(review): ESR_EL2_EC_MASK is defined but never applied; confirm the exception class need not be checked
+    and x10, x10, #ESR_EL2_ISS_MASK         // keep bits [24:0] (presumably the hvc immediate; confirm)
+    cmp x10, #HVC_MAX_INDEX
+    b.ge out_of_range                       // index >= HVC_MAX_INDEX has no table slot
+
+    lsl x10, x10, #2                        // index * 4: each table slot is one branch instruction
+    adr x9, table
+    add x9, x9, x10
+    br x9                                   // jump to table[index]
+
+table:
+    b el2_set_stack                         // index 0
+
+out_of_range:
+    mov x0, MX_ERR_OUT_OF_RANGE             // error status returned in x0
+    eret
+.endm
+
+FUNCTION_LABEL(arm64_el2_exception_base)    // start of the EL2 vector table; entries are placed 0x80 bytes apart
+
+/* exceptions from current EL, using SP0 */
+.org 0x000
+LOCAL_FUNCTION(arm64_el2_sync_exc_current_el_SP0)
+    invalid_exception
+END_FUNCTION(arm64_el2_sync_exc_current_el_SP0)
+
+.org 0x080
+LOCAL_FUNCTION(arm64_el2_irq_current_el_SP0)
+    invalid_exception
+END_FUNCTION(arm64_el2_irq_current_el_SP0)
+
+.org 0x100
+LOCAL_FUNCTION(arm64_el2_fiq_current_el_SP0)
+    invalid_exception
+END_FUNCTION(arm64_el2_fiq_current_el_SP0)
+
+.org 0x180
+LOCAL_FUNCTION(arm64_el2_err_exc_current_el_SP0)
+    invalid_exception
+END_FUNCTION(arm64_el2_err_exc_current_el_SP0)
+
+/* exceptions from current EL, using SPx */
+.org 0x200
+LOCAL_FUNCTION(arm64_el2_sync_exc_current_el_SPx)
+    invalid_exception
+END_FUNCTION(arm64_el2_sync_exc_current_el_SPx)
+
+.org 0x280
+LOCAL_FUNCTION(arm64_el2_irq_current_el_SPx)
+    invalid_exception
+END_FUNCTION(arm64_el2_irq_current_el_SPx)
+
+.org 0x300
+LOCAL_FUNCTION(arm64_el2_fiq_current_el_SPx)
+    invalid_exception
+END_FUNCTION(arm64_el2_fiq_current_el_SPx)
+
+.org 0x380
+LOCAL_FUNCTION(arm64_el2_err_exc_current_el_SPx)
+    invalid_exception
+END_FUNCTION(arm64_el2_err_exc_current_el_SPx)
+
+/* exceptions from lower EL, running arm64 */
+.org 0x400
+LOCAL_FUNCTION(arm64_el2_sync_exc_lower_el_64)
+    sync_exception                          // the only vector that dispatches; all others expand invalid_exception
+END_FUNCTION(arm64_el2_sync_exc_lower_el_64)
+
+.org 0x480
+LOCAL_FUNCTION(arm64_el2_irq_lower_el_64)
+    invalid_exception
+END_FUNCTION(arm64_el2_irq_lower_el_64)
+
+.org 0x500
+LOCAL_FUNCTION(arm64_el2_fiq_lower_el_64)
+    invalid_exception
+END_FUNCTION(arm64_el2_fiq_lower_el_64)
+
+.org 0x580
+LOCAL_FUNCTION(arm64_el2_err_exc_lower_el_64)
+    invalid_exception
+END_FUNCTION(arm64_el2_err_exc_lower_el_64)
+
+/* exceptions from lower EL, running arm32 */
+.org 0x600
+LOCAL_FUNCTION(arm64_el2_sync_exc_lower_el_32)
+    invalid_exception
+END_FUNCTION(arm64_el2_sync_exc_lower_el_32)
+
+.org 0x680
+LOCAL_FUNCTION(arm64_el2_irq_lower_el_32)
+    invalid_exception
+END_FUNCTION(arm64_el2_irq_lower_el_32)
+
+.org 0x700
+LOCAL_FUNCTION(arm64_el2_fiq_lower_el_32)
+    invalid_exception
+END_FUNCTION(arm64_el2_fiq_lower_el_32)
+
+.org 0x780
+LOCAL_FUNCTION(arm64_el2_err_exc_lower_el_32)
+    invalid_exception
+END_FUNCTION(arm64_el2_err_exc_lower_el_32)
@@ -0,0 +1,114 @@
+// Copyright 2017 The Fuchsia Authors
+//
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file or at
+// https://opensource.org/licenses/MIT
+
+#include "el2_cpu_state_priv.h"
+
+#include <fbl/auto_lock.h>
+#include <fbl/mutex.h>
+#include <vm/pmm.h>
+
+static fbl::Mutex el2_mutex;  // held by alloc_vmid() and free_vmid() for their whole body
+static size_t num_guests TA_GUARDED(el2_mutex) = 0;  // raised in alloc_vmid(), lowered in free_vmid()
+static fbl::unique_ptr<El2CpuState> el2_cpu_state TA_GUARDED(el2_mutex);  // created for the first guest, reset after the last
+
+// Issues "hvc #0" so that EL2 installs |stack_top| as its stack pointer, and
+// returns the status the EL2 handler leaves in x0.
+//
+// The argument and the result are tied to x0 through an explicit in/out
+// operand: the compiler must place |stack_top| in x0 before the call and read
+// the status back afterwards, even if this function is inlined. x9 and x10 are
+// declared clobbered because the EL2 dispatch code overwrites them, and
+// "memory" keeps the call ordered against surrounding loads and stores.
+static mx_status_t el2_set_stack(mx_paddr_t stack_top) {
+    register mx_paddr_t arg_and_result asm("x0") = stack_top;
+    __asm__ volatile("hvc #0"
+                     : "+r"(arg_and_result)
+                     :
+                     : "x9", "x10", "memory");
+    return static_cast<mx_status_t>(arg_and_result);
+}
+
+// Gives the stack pages back to the PMM; a zero address means nothing was
+// allocated and there is nothing to free.
+El2Stack::~El2Stack() {
+    if (stack_paddr_ == 0)
+        return;
+    auto stack_base = paddr_to_kvaddr(stack_paddr_);
+    pmm_free_kpages(stack_base, ARCH_DEFAULT_STACK_SIZE / PAGE_SIZE);
+}
+
+// Requests ARCH_DEFAULT_STACK_SIZE worth of kernel pages and records their
+// physical address. Reports MX_ERR_NO_MEMORY when that address is still zero.
+mx_status_t El2Stack::Alloc() {
+    const size_t page_count = ARCH_DEFAULT_STACK_SIZE / PAGE_SIZE;
+    pmm_alloc_kpages(page_count, nullptr, &stack_paddr_);
+    if (stack_paddr_ == 0)
+        return MX_ERR_NO_MEMORY;
+    return MX_OK;
+}
+
+// Physical address ARCH_DEFAULT_STACK_SIZE bytes past the start of the stack.
+mx_paddr_t El2Stack::Top() const {
+    const mx_paddr_t base = stack_paddr_;
+    return base + ARCH_DEFAULT_STACK_SIZE;
+}
+
+// Per-CPU task: |context| is the fbl::Array<El2Stack> handed over by
+// El2CpuState::Create. Installs the entry for |cpu_num| as the EL2 stack and
+// reports the status of doing so.
+static mx_status_t el2_on_task(void* context, uint cpu_num) {
+    auto* all_stacks = static_cast<fbl::Array<El2Stack>*>(context);
+    const mx_paddr_t top = (*all_stacks)[cpu_num].Top();
+
+    const mx_status_t result = el2_set_stack(top);
+    if (result == MX_OK)
+        return MX_OK;
+
+    dprintf(CRITICAL, "Failed to set EL2 stack for CPU %u\n", cpu_num);
+    return result;
+}
+
+// Sets the EL2 stack of the current CPU to 0 and logs if that fails. |arg| is
+// not used.
+static void el2_off_task(void* arg) {
+    if (el2_set_stack(0) == MX_OK)
+        return;
+    dprintf(CRITICAL, "Failed to clear EL2 stack for CPU %u\n", arch_curr_cpu_num());
+}
+
+// static
+// Builds the EL2 state in three steps: initialise the object, allocate one
+// EL2 stack for each possible CPU, then install the stacks on the online CPUs.
+// Ownership is passed to |out| only when every step succeeded.
+mx_status_t El2CpuState::Create(fbl::unique_ptr<El2CpuState>* out) {
+    fbl::AllocChecker ac;
+    fbl::unique_ptr<El2CpuState> state(new (&ac) El2CpuState);
+    if (!ac.check())
+        return MX_ERR_NO_MEMORY;
+    const mx_status_t init_status = state->Init();
+    if (init_status != MX_OK)
+        return init_status;
+
+    // One stack per CPU slot.
+    const size_t cpu_count = arch_max_num_cpus();
+    El2Stack* raw_stacks = new (&ac) El2Stack[cpu_count];
+    if (!ac.check())
+        return MX_ERR_NO_MEMORY;
+    fbl::Array<El2Stack> stack_array(raw_stacks, cpu_count);
+    for (size_t i = 0; i < cpu_count; i++) {
+        const mx_status_t alloc_status = stack_array[i].Alloc();
+        if (alloc_status != MX_OK)
+            return alloc_status;
+    }
+
+    // Run el2_on_task across the CPUs. The returned mask is treated as the set
+    // of CPUs that were set up; if it is not the full online set, those CPUs
+    // are switched off again.
+    const mp_cpu_mask_t enabled_mask = percpu_exec(el2_on_task, &stack_array);
+    if (enabled_mask != mp_get_online_mask()) {
+        mp_sync_exec(MP_IPI_TARGET_MASK, enabled_mask, el2_off_task, nullptr);
+        return MX_ERR_NOT_SUPPORTED;
+    }
+
+    state->el2_stacks_ = fbl::move(stack_array);
+    *out = fbl::move(state);
+    return MX_OK;
+}
+
+El2CpuState::~El2CpuState() {
+    mp_sync_exec(MP_IPI_TARGET_ALL, 0, el2_off_task, nullptr);  // run el2_off_task with the "all CPUs" target
+}
+
+// Allocates a VMID into |vmid|, creating the shared EL2 state first when no
+// guest exists yet.
+//
+// num_guests is incremented only after AllocId() has succeeded. Guest::Create
+// does not call free_vmid() for a failed allocation, so counting the attempt
+// up front would leave num_guests permanently too high and the EL2 state would
+// never be torn down. If the allocation fails while there are no guests, the
+// state created above is released again.
+mx_status_t alloc_vmid(uint8_t* vmid) {
+    fbl::AutoLock lock(&el2_mutex);
+    if (num_guests == 0) {
+        mx_status_t status = El2CpuState::Create(&el2_cpu_state);
+        if (status != MX_OK)
+            return status;
+    }
+    mx_status_t status = el2_cpu_state->AllocId(vmid);
+    if (status != MX_OK) {
+        if (num_guests == 0)
+            el2_cpu_state.reset();
+        return status;
+    }
+    num_guests++;
+    return MX_OK;
+}
+
+// Releases |vmid|. When this was the last guest, the shared EL2 state is
+// destroyed as well. A failure from FreeId() is returned unchanged and leaves
+// the guest count alone.
+mx_status_t free_vmid(uint8_t vmid) {
+    fbl::AutoLock lock(&el2_mutex);
+    const mx_status_t free_status = el2_cpu_state->FreeId(vmid);
+    if (free_status != MX_OK)
+        return free_status;
+    if (--num_guests == 0)
+        el2_cpu_state.reset();
+    return MX_OK;
+}
@@ -0,0 +1,40 @@
+// Copyright 2017 The Fuchsia Authors
+//
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file or at
+// https://opensource.org/licenses/MIT
+
+#pragma once
+
+#include <fbl/array.h>
+#include <fbl/unique_ptr.h>
+#include <hypervisor/cpu_state.h>
+
+/* Represents a stack for use with EL2; holds the physical address of its pages. */
+class El2Stack {
+public:
+    El2Stack() = default;
+    ~El2Stack();  // frees the pages when stack_paddr_ is non-zero
+    DISALLOW_COPY_ASSIGN_AND_MOVE(El2Stack);
+
+    mx_status_t Alloc();  // MX_ERR_NO_MEMORY when no pages were obtained
+    mx_paddr_t Top() const;  // stack_paddr_ + ARCH_DEFAULT_STACK_SIZE
+
+private:
+    mx_paddr_t stack_paddr_ = 0;  // 0 means nothing is allocated
+};
+
+/* Maintains the EL2 state for each CPU, including the per-CPU EL2 stacks. */
+class El2CpuState : public hypervisor::CpuState<uint8_t, 64> {
+public:
+    static mx_status_t Create(fbl::unique_ptr<El2CpuState>* out);
+    ~El2CpuState();
+
+private:
+    fbl::Array<El2Stack> el2_stacks_;  // indexed by CPU number in el2_on_task
+
+    El2CpuState() = default;  // private: instances are built by Create()
+};
+
+mx_status_t alloc_vmid(uint8_t* vmid);  // creates the EL2 state for the first guest, then allocates an ID into |vmid|
+mx_status_t free_vmid(uint8_t vmid);  // releases |vmid|; destroys the EL2 state along with the last guest
@@ -0,0 +1,51 @@
+// Copyright 2017 The Fuchsia Authors
+//
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file or at
+// https://opensource.org/licenses/MIT
+
+#include <arch/hypervisor.h>
+#include <fbl/auto_call.h>
+#include <magenta/errors.h>
+#include <vm/vm_object.h>
+
+#include "el2_cpu_state_priv.h"
+
+// static
+// Reserves a VMID and wraps it in a newly allocated Guest, which is stored in
+// |out|. If the Guest cannot be allocated the VMID is given back.
+mx_status_t Guest::Create(fbl::RefPtr<VmObject> physmem, fbl::unique_ptr<Guest>* out) {
+    uint8_t vmid;
+    const mx_status_t alloc_status = alloc_vmid(&vmid);
+    if (alloc_status != MX_OK)
+        return alloc_status;
+
+    fbl::AllocChecker ac;
+    fbl::unique_ptr<Guest> new_guest(new (&ac) Guest(vmid));
+    if (!ac.check()) {
+        // No Guest owns the VMID, so it has to be released by hand.
+        free_vmid(vmid);
+        return MX_ERR_NO_MEMORY;
+    }
+
+    *out = fbl::move(new_guest);
+    // TODO(abdulla): We intentionally return MX_ERR_NOT_SUPPORTED, as the guest
+    // physical address space has not been wired up yet.
+    return MX_ERR_NOT_SUPPORTED;
+}
+
+Guest::Guest(uint8_t vmid)
+    : vmid_(vmid) {}  // remembers the VMID; the destructor passes it to free_vmid()
+
+Guest::~Guest() {
+    free_vmid(vmid_);  // give back the VMID stored by the constructor; the status is ignored
+}
+
+// Creates a guest through Guest::Create, but only when arm64_get_boot_el()
+// reports 2 or higher; otherwise MX_ERR_NOT_SUPPORTED is returned.
+mx_status_t arch_guest_create(fbl::RefPtr<VmObject> physmem, fbl::unique_ptr<Guest>* guest) {
+    const bool booted_at_el2_or_above = arm64_get_boot_el() >= 2;
+    if (!booted_at_el2_or_above)
+        return MX_ERR_NOT_SUPPORTED;
+
+    return Guest::Create(fbl::move(physmem), guest);
+}
+
+mx_status_t arch_guest_set_trap(Guest* guest, uint32_t kind, mx_vaddr_t addr, size_t len,
+                                fbl::RefPtr<PortDispatcher> port, uint64_t key) {
+    return MX_ERR_NOT_SUPPORTED;  // stub: no argument is examined
+}
@@ -1,5 +1,4 @@
-# Copyright 2016 The Fuchsia Authors
-# Copyright (c) 2008-2015 Travis Geiselbrecht
+# Copyright 2017 The Fuchsia Authors
 #
 # Use of this source code is governed by a MIT-style
 # license that can be found in the LICENSE file or at
@@ -9,9 +8,10 @@ LOCAL_DIR := $(GET_LOCAL_DIR)

 MODULE := $(LOCAL_DIR)

-MODULE_DEPS += kernel/lib/gfx
-
-MODULE_SRCS += \
-	$(LOCAL_DIR)/font.c
+MODULE_SRCS := \
+	$(LOCAL_DIR)/el2.S \
+	$(LOCAL_DIR)/el2_cpu_state.cpp \
+	$(LOCAL_DIR)/guest.cpp \
+	$(LOCAL_DIR)/vcpu.cpp \

 include make/module.mk
@@ -0,0 +1,24 @@
+// Copyright 2017 The Fuchsia Authors
+//
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file or at
+// https://opensource.org/licenses/MIT
+
+#include <arch/hypervisor.h>
+#include <magenta/errors.h>
+
+mx_status_t arch_vcpu_resume(Vcpu* vcpu, mx_port_packet_t* packet) {
+    return MX_ERR_NOT_SUPPORTED;  // stub: |vcpu| and |packet| are not touched
+}
+
+mx_status_t arch_vcpu_interrupt(Vcpu* vcpu, uint32_t interrupt) {
+    return MX_ERR_NOT_SUPPORTED;  // stub: no interrupt is delivered
+}
+
+mx_status_t arch_vcpu_read_state(const Vcpu* vcpu, uint32_t kind, void* buffer, uint32_t len) {
+    return MX_ERR_NOT_SUPPORTED;  // stub: |buffer| is left unwritten
+}
+
+mx_status_t arch_vcpu_write_state(Vcpu* vcpu, uint32_t kind, const void* buffer, uint32_t len) {
+    return MX_ERR_NOT_SUPPORTED;  // stub: |buffer| is not read
+}
@@ -90,7 +90,7 @@ struct arch_exception_context {
 };

 struct thread;
-extern void arm64_exception_base(void);
+extern void arm64_el1_exception_base(void);
 void arm64_el3_to_el1(void);
 void arm64_sync_exception(struct arm64_iframe_long *iframe, uint exception_flags);

@@ -8,19 +8,21 @@

 #ifndef ASSEMBLY

+#include <kernel/atomic.h>
+
 __BEGIN_CDECLS

 // override of some routines
 static inline void arch_enable_ints(void)
 {
-    CF;
+    atomic_signal_fence();  // fence first, then unmask (presumably compiler-only; confirm in kernel/atomic.h)
    __asm__ volatile("msr daifclr, #2" ::: "memory");  // clear the IRQ mask bit in DAIF
 }

 static inline void arch_disable_ints(void)
 {
    __asm__ volatile("msr daifset, #2" ::: "memory");  // set the IRQ mask bit in DAIF
-    CF;
+    atomic_signal_fence();  // mask first, then fence (presumably compiler-only; confirm in kernel/atomic.h)
 }

 static inline bool arch_ints_disabled(void)
@@ -35,14 +37,14 @@ static inline bool arch_ints_disabled(void)

 static inline void arch_enable_fiqs(void)
 {
-    CF;
+    atomic_signal_fence();  // fence first, then unmask (presumably compiler-only; confirm in kernel/atomic.h)
    __asm__ volatile("msr daifclr, #1" ::: "memory");  // clear the FIQ mask bit in DAIF
 }

 static inline void arch_disable_fiqs(void)
 {
    __asm__ volatile("msr daifset, #1" ::: "memory");  // set the FIQ mask bit in DAIF
-    CF;
+    atomic_signal_fence();  // mask first, then fence (presumably compiler-only; confirm in kernel/atomic.h)
 }

 // XXX
@@ -8,13 +8,14 @@
 #pragma once

 #include <arch/arm64/mmu.h>
-#include <kernel/vm/arch_vm_aspace.h>
+#include <vm/arch_vm_aspace.h>
 #include <magenta/compiler.h>
-#include <mxtl/canary.h>
+#include <fbl/canary.h>
+#include <fbl/mutex.h>

 class ArmArchVmAspace final : public ArchVmAspaceInterface {
 public:
-    ArmArchVmAspace() {}
+    ArmArchVmAspace();
    virtual ~ArmArchVmAspace();

    status_t Init(vaddr_t base, size_t size, uint mmu_flags) override;
@@ -46,41 +47,43 @@ private:

    // Page table management.
    volatile pte_t* GetPageTable(vaddr_t index, uint page_size_shift,
-                                 volatile pte_t* page_table);
+                                 volatile pte_t* page_table) TA_REQ(lock_);

-    status_t AllocPageTable(paddr_t* paddrp, uint page_size_shift);
+    status_t AllocPageTable(paddr_t* paddrp, uint page_size_shift) TA_REQ(lock_);

-    void FreePageTable(void* vaddr, paddr_t paddr, uint page_size_shift);
+    void FreePageTable(void* vaddr, paddr_t paddr, uint page_size_shift) TA_REQ(lock_);

    ssize_t MapPageTable(vaddr_t vaddr_in, vaddr_t vaddr_rel_in,
                         paddr_t paddr_in, size_t size_in, pte_t attrs,
                         uint index_shift, uint page_size_shift,
-                         volatile pte_t* page_table, uint asid);
+                         volatile pte_t* page_table, uint asid) TA_REQ(lock_);

    ssize_t UnmapPageTable(vaddr_t vaddr, vaddr_t vaddr_rel, size_t size,
                           uint index_shift, uint page_size_shift,
-                           volatile pte_t* page_table, uint asid);
+                           volatile pte_t* page_table, uint asid) TA_REQ(lock_);

    int ProtectPageTable(vaddr_t vaddr_in, vaddr_t vaddr_rel_in, size_t size_in,
                         pte_t attrs, uint index_shift, uint page_size_shift,
-                         volatile pte_t* page_table, uint asid);
+                         volatile pte_t* page_table, uint asid) TA_REQ(lock_);

    ssize_t MapPages(vaddr_t vaddr, paddr_t paddr, size_t size, pte_t attrs,
                     vaddr_t vaddr_base, uint top_size_shift, uint top_index_shift,
                     uint page_size_shift, volatile pte_t* top_page_table,
-                     uint asid);
+                     uint asid) TA_REQ(lock_);

    ssize_t UnmapPages(vaddr_t vaddr, size_t size, vaddr_t vaddr_base,
                       uint top_size_shift, uint top_index_shift,
                       uint page_size_shift, volatile pte_t* top_page_table,
-                       uint asid);
+                       uint asid) TA_REQ(lock_);

    status_t ProtectPages(vaddr_t vaddr, size_t size, pte_t attrs,
                          vaddr_t vaddr_base, uint top_size_shift,
                          uint top_index_shift, uint page_size_shift,
-                          volatile pte_t* top_page_table, uint asid);
+                          volatile pte_t* top_page_table, uint asid) TA_REQ(lock_);

-    mxtl::Canary<mxtl::magic("VAAS")> canary_;
+    fbl::Canary<fbl::magic("VAAS")> canary_;
+
+    fbl::Mutex lock_;

    uint16_t asid_ = 0;

--- a/Mostrar Mais
+++ b/Mostrar Mais