WIP moving cpu affinity to a mask

Change-Id: I5dfc9f8e1c23d63ed749c84bc3dfa4118470b8e6
[kernel][mp] add new header with types and routines to deal with cpu numbers
2017-09-07 14:33:09 -07:00 · 2017-09-07 14:33:08 -07:00 · 2017-09-07 14:30:48 -07:00 · 2017-09-07 14:30:48 -07:00
@@ -52,15 +52,15 @@ void clock_tests(void) {
        printf("%d\n", i + 1);
    }

-    int old_affinity = thread_pinned_cpu(get_current_thread());
+    cpu_mask_t old_affinity = get_current_thread()->cpu_affinity;

-    for (int cpu = 0; cpu < SMP_MAX_CPUS; cpu++) {
+    for (cpu_num_t cpu = 0; cpu < SMP_MAX_CPUS; cpu++) {
        if (!mp_is_cpu_online(cpu))
            continue;

        printf("measuring cpu clock against current_time() on cpu %u\n", cpu);

-        thread_set_pinned_cpu(get_current_thread(), cpu);
-        mp_reschedule(MP_IPI_TARGET_MASK, 1u << cpu, 0);
+        thread_set_cpu_affinity(get_current_thread(), cpu_num_to_mask(cpu));
+        mp_reschedule(MP_IPI_TARGET_MASK, cpu_num_to_mask(cpu), 0);
        thread_yield();

@@ -74,7 +74,7 @@ void clock_tests(void) {
        }
    }

-    thread_set_pinned_cpu(get_current_thread(), old_affinity);
+    thread_set_cpu_affinity(get_current_thread(), old_affinity);
    mp_reschedule(MP_IPI_TARGET_ALL_BUT_LOCAL, 0, 0);
    thread_yield();
 }
@@ -344,7 +344,7 @@ static void preempt_test(void) {
    for (int i = 0; i < num_threads; i++) {
        thread_t* t = thread_create("preempt tester", &preempt_tester, NULL, LOW_PRIORITY, DEFAULT_STACK_SIZE);
        thread_set_real_time(t);
-        thread_set_pinned_cpu(t, 0);
+        thread_set_cpu_affinity(t, cpu_num_to_mask(0));
        thread_detach_and_resume(t);
    }

@@ -56,7 +56,7 @@ static void timer_test_all_cpus(void) {
            printf("failed to create thread for cpu %d\n", i);
            return;
        }
-        thread_set_pinned_cpu(timer_threads[i], i);
+        thread_set_cpu_affinity(timer_threads[i], cpu_num_to_mask(i));
        thread_resume(timer_threads[i]);
    }
    uint joined = 0;
@@ -75,7 +75,7 @@ static inline void arm64_write_percpu_u32(size_t offset, uint32_t val) {
            : "memory");
 }

-static inline uint arch_curr_cpu_num(void) {
+static inline cpu_num_t arch_curr_cpu_num(void) {
    return arm64_read_percpu_u32(offsetof(struct arm64_percpu, cpu_num));
 }

@@ -64,7 +64,7 @@ static uint arch_curr_cpu_num_slow() {
    return arm64_cpu_map[cluster][cpu];
 }

-status_t arch_mp_send_ipi(mp_ipi_target_t target, mp_cpu_mask_t mask, mp_ipi_t ipi) {
+status_t arch_mp_send_ipi(mp_ipi_target_t target, cpu_mask_t mask, mp_ipi_t ipi) {
    LTRACEF("target %d mask %#x, ipi %d\n", target, mask, ipi);

    // translate the high level target + mask mechanism into just a mask
@@ -72,7 +72,7 @@ status_t arch_mp_send_ipi(mp_ipi_target_t target, mp_cpu_mask_t mask, mp_ipi_t i
        mask = (1ul << SMP_MAX_CPUS) - 1;
    } else if (target == MP_IPI_TARGET_ALL) {
        mask = (1ul << SMP_MAX_CPUS) - 1;
-        mask &= ~(1u << arch_curr_cpu_num());
+        mask &= ~cpu_num_to_mask(arch_curr_cpu_num());
    }

    return interrupt_send_ipi(mask, ipi);
@@ -203,32 +203,30 @@ mx_status_t AutoVmcs::SetControl(VmcsField32 controls, uint64_t true_msr, uint64
    return MX_OK;
 }

-static uint cpu_of(uint16_t vpid) {
+static cpu_num_t cpu_of(uint16_t vpid) {
    return vpid % arch_max_num_cpus();
 }

 static void pin_thread(thread_t* thread, uint16_t vpid) {
-    uint cpu = cpu_of(vpid);
-    if (thread_pinned_cpu(thread) != static_cast<int>(cpu))
-        thread_set_pinned_cpu(thread, cpu);
-    if (arch_curr_cpu_num() != cpu)
-        thread_reschedule();
+    cpu_num_t cpu = cpu_of(vpid);
+
+    thread_set_cpu_affinity(thread, cpu_num_to_mask(cpu));
 }

 static bool check_pinned_cpu_invariant(const thread_t* thread, uint16_t vpid) {
-    uint cpu = cpu_of(vpid);
+    cpu_num_t cpu = cpu_of(vpid);
    return thread == get_current_thread() &&
-           thread_pinned_cpu(thread) == static_cast<int>(cpu) &&
+           thread->cpu_affinity & cpu_num_to_mask(cpu) &&
           arch_curr_cpu_num() == cpu;
 }

 AutoPin::AutoPin(const Vcpu* vcpu)
-    : thread_(get_current_thread()), prev_cpu_(thread_pinned_cpu(thread_)) {
+    : thread_(get_current_thread()), prev_cpu_mask_(thread_->cpu_affinity) {
    pin_thread(thread_, vcpu->vpid());
 }

 AutoPin::~AutoPin() {
-    thread_set_pinned_cpu(thread_, prev_cpu_);
+    thread_set_cpu_affinity(thread_, prev_cpu_mask_);
 }

 static uint64_t ept_pointer(paddr_t pml4_address) {
@@ -207,5 +207,5 @@ public:

 private:
    thread_t* thread_;
-    int prev_cpu_;
+    cpu_mask_t prev_cpu_mask_;
 };
@@ -120,7 +120,7 @@ VmxPage::~VmxPage() {

 struct vmxon_context {
    fbl::Array<VmxPage>* vmxon_pages;
-    fbl::atomic<mp_cpu_mask_t> cpu_mask;
+    fbl::atomic<cpu_mask_t> cpu_mask;

    vmxon_context(fbl::Array<VmxPage>* vp)
        : vmxon_pages(vp), cpu_mask(0) {}
@@ -230,9 +230,9 @@ mx_status_t VmxCpuState::Create(fbl::unique_ptr<VmxCpuState>* out) {

    // Enable VMX for all online CPUs.
    vmxon_context vmxon_ctx(&vmxon_pages);
-    mp_cpu_mask_t online_mask = mp_get_online_mask();
+    cpu_mask_t online_mask = mp_get_online_mask();
    mp_sync_exec(MP_IPI_TARGET_MASK, online_mask, vmxon_task, &vmxon_ctx);
-    mp_cpu_mask_t cpu_mask = vmxon_ctx.cpu_mask.load();
+    cpu_mask_t cpu_mask = vmxon_ctx.cpu_mask.load();
    if (cpu_mask != online_mask) {
        mp_sync_exec(MP_IPI_TARGET_MASK, cpu_mask, vmxoff_task, nullptr);
        return MX_ERR_NOT_SUPPORTED;
@@ -6,11 +6,11 @@
 #pragma once

 #include <magenta/compiler.h>
-#include <kernel/mp.h>
+#include <kernel/cpu.h>

 __BEGIN_CDECLS

 void x86_mmu_mem_type_init(void);
-void x86_pat_sync(mp_cpu_mask_t targets);
+void x86_pat_sync(cpu_mask_t targets);

 __END_CDECLS
@@ -63,7 +63,7 @@ struct x86_percpu {
    uintptr_t gpf_return_target;

    /* CPU number */
-    uint32_t cpu_num;
+    cpu_num_t cpu_num;

    /* This CPU's default TSS */
    tss_t default_tss __ALIGNED(16);
@@ -103,7 +103,7 @@ static inline struct x86_percpu *x86_get_percpu(void)
    return (struct x86_percpu *)x86_read_gs_offset64(PERCPU_DIRECT_OFFSET);
 }

-static inline uint arch_curr_cpu_num(void)
+static inline cpu_num_t arch_curr_cpu_num(void)
 {
    return x86_get_percpu()->cpu_num;
 }
@@ -173,7 +173,7 @@ static void x86_tlb_invalidate_page(X86ArchVmAspace* aspace, vaddr_t vaddr,
     * the write to the page table, so it will see the change.  In the latter
     * case, it will get a spurious request to flush. */
    mp_ipi_target_t target;
-    mp_cpu_mask_t target_mask = 0;
+    cpu_mask_t target_mask = 0;
    if (global_page || aspace == nullptr) {
        target = MP_IPI_TARGET_ALL;
    } else {
@@ -1281,7 +1281,7 @@ X86ArchVmAspace::X86ArchVmAspace() {}
 * Fill in the high level x86 arch aspace structure and allocating a top level page table.
 */
 status_t X86ArchVmAspace::Init(vaddr_t base, size_t size, uint mmu_flags) {
-    static_assert(sizeof(mp_cpu_mask_t) == sizeof(active_cpus_), "err");
+    static_assert(sizeof(cpu_mask_t) == sizeof(active_cpus_), "err");
    canary_.Assert();

    fbl::AutoLock a(&lock_);
@@ -1375,7 +1375,7 @@ status_t X86ArchVmAspace::Destroy() {
 }

 void X86ArchVmAspace::ContextSwitch(X86ArchVmAspace* old_aspace, X86ArchVmAspace* aspace) {
-    mp_cpu_mask_t cpu_bit = 1U << arch_curr_cpu_num();
+    cpu_mask_t cpu_bit = cpu_num_to_mask(arch_curr_cpu_num());
    if (aspace != nullptr) {
        aspace->canary_.Assert();
        LTRACEF_LEVEL(3, "switching to aspace %p, pt %#" PRIXPTR "\n", aspace, aspace->pt_phys_);
@@ -13,6 +13,7 @@
 #include <arch/x86/mmu.h>
 #include <arch/x86/mmu_mem_types.h>
 #include <arch/x86/registers.h>
+#include <kernel/mp.h>
 #include <lib/console.h>

 /* address widths from mmu.c */
@@ -119,7 +120,7 @@ void x86_mmu_mem_type_init(void)
 *
 * This algorithm is based on section 11.11.8 of Intel 3A
 */
-void x86_pat_sync(mp_cpu_mask_t targets)
+void x86_pat_sync(cpu_mask_t targets)
 {
    targets &= mp_get_online_mask();

@@ -77,7 +77,7 @@ status_t x86_allocate_ap_structures(uint32_t *apic_ids, uint8_t cpu_count)
    return MX_OK;
 }

-void x86_init_percpu(uint cpu_num)
+void x86_init_percpu(cpu_num_t cpu_num)
 {
    struct x86_percpu *const percpu =
        cpu_num == 0 ? &bp_percpu : &ap_percpus[cpu_num - 1];
@@ -217,7 +217,7 @@ int x86_apic_id_to_cpu_num(uint32_t apic_id)
    return -1;
 }

-status_t arch_mp_send_ipi(mp_ipi_target_t target, mp_cpu_mask_t mask, mp_ipi_t ipi)
+status_t arch_mp_send_ipi(mp_ipi_target_t target, cpu_mask_t mask, mp_ipi_t ipi)
 {
    uint8_t vector = 0;
    switch (ipi) {
@@ -244,7 +244,7 @@ status_t arch_mp_send_ipi(mp_ipi_target_t target, mp_cpu_mask_t mask, mp_ipi_t i

    ASSERT(x86_num_cpus <= sizeof(mask) * CHAR_BIT);

-    mp_cpu_mask_t remaining = mask;
+    cpu_mask_t remaining = mask;
    uint cpu_id = 0;
    while (remaining && cpu_id < x86_num_cpus) {
        if (remaining & 1) {
@@ -272,7 +272,7 @@ static enum handler_return gic_handle_fiq(struct iframe *frame)
    PANIC_UNIMPLEMENTED;
 }

-static status_t gic_send_ipi(mp_cpu_mask_t target, mp_ipi_t ipi) {
+static status_t gic_send_ipi(cpu_mask_t target, mp_ipi_t ipi) {
    uint gic_ipi_num = ipi + ipi_base;

    /* filter out targets outside of the range of cpus we care about */
@@ -294,7 +294,7 @@ static enum handler_return gic_handle_fiq(iframe* frame) {
    PANIC_UNIMPLEMENTED;
 }

-static status_t gic_send_ipi(mp_cpu_mask_t target, mp_ipi_t ipi) {
+static status_t gic_send_ipi(cpu_mask_t target, mp_ipi_t ipi) {
    uint gic_ipi_num = ipi + ipi_base;

    /* filter out targets outside of the range of cpus we care about */
@@ -221,7 +221,7 @@ static enum handler_return bcm28xx_handle_fiq(struct arm64_iframe_short* frame)
 }


-static status_t bcm28xx_send_ipi(mp_cpu_mask_t target, mp_ipi_t ipi) {
+static status_t bcm28xx_send_ipi(cpu_mask_t target, mp_ipi_t ipi) {
    /* filter out targets outside of the range of cpus we care about */
    target &= ((1UL << SMP_MAX_CPUS) - 1);
    if (target != 0) {
@@ -46,7 +46,7 @@ bool is_valid_interrupt(unsigned int vector, uint32_t flags);
 unsigned int remap_interrupt(unsigned int vector);

 /* sends an inter-processor interrupt */
-status_t interrupt_send_ipi(mp_cpu_mask_t target, mp_ipi_t ipi);
+status_t interrupt_send_ipi(cpu_mask_t target, mp_ipi_t ipi);

 /* performs per-cpu initialization for the interrupt controller */
 void interrupt_init_percpu(void);
@@ -30,7 +30,7 @@ struct pdev_interrupt_ops {
                           enum interrupt_polarity* pol);
    bool (*is_valid)(unsigned int vector, uint32_t flags);
    unsigned int (*remap)(unsigned int vector);
-    status_t (*send_ipi)(mp_cpu_mask_t target, mp_ipi_t ipi);
+    status_t (*send_ipi)(cpu_mask_t target, mp_ipi_t ipi);
    void (*init_percpu_early)(void);
    void (*init_percpu)(void);
    enum handler_return (*handle_irq)(iframe* frame);
@@ -66,7 +66,7 @@ static unsigned int default_remap(unsigned int vector) {
    return 0;
 }

-static status_t default_send_ipi(mp_cpu_mask_t target, mp_ipi_t ipi) {
+static status_t default_send_ipi(cpu_mask_t target, mp_ipi_t ipi) {
    return MX_ERR_NOT_CONFIGURED;
 }

@@ -130,7 +130,7 @@ unsigned int remap_interrupt(unsigned int vector) {
    return intr_ops->remap(vector);
 }

-status_t interrupt_send_ipi(mp_cpu_mask_t target, mp_ipi_t ipi) {
+status_t interrupt_send_ipi(cpu_mask_t target, mp_ipi_t ipi) {
    return intr_ops->send_ipi(target, ipi);
 }

@@ -8,35 +8,36 @@
 #pragma once

 #include <sys/types.h>
+#include <kernel/cpu.h>
 #include <kernel/mp.h>

 __BEGIN_CDECLS

 /* send inter processor interrupt, if supported */
-status_t arch_mp_send_ipi(mp_ipi_target_t, mp_cpu_mask_t mask, mp_ipi_t ipi);
+status_t arch_mp_send_ipi(mp_ipi_target_t, cpu_mask_t mask, mp_ipi_t ipi);

 /* Bring a CPU up and enter it into the scheduler */
-status_t platform_mp_cpu_hotplug(uint cpu_id);
+status_t platform_mp_cpu_hotplug(cpu_num_t cpu_id);

 /* Prepare for CPU unplug.  The platform may want to shift
 * around external interrupts at this time. */
-status_t platform_mp_prep_cpu_unplug(uint cpu_id);
+status_t platform_mp_prep_cpu_unplug(cpu_num_t cpu_id);

 /* shutdown the specified CPU.  called after it is no longer
 * being scheduled on.  */
-status_t platform_mp_cpu_unplug(uint cpu_id);
+status_t platform_mp_cpu_unplug(cpu_num_t cpu_id);

 /* Should be invoked by platform_mp_cpu_hotplug to ask the arch
 * to bring a CPU up and enter it into the scheduler */
-status_t arch_mp_cpu_hotplug(uint cpu_id);
+status_t arch_mp_cpu_hotplug(cpu_num_t cpu_id);

 /* Should be invoked by platform_mp_prep_cpu_unplug to ask the
 * arch to do whatever it needs to do to stop the CPU */
-status_t arch_mp_prep_cpu_unplug(uint cpu_id);
+status_t arch_mp_prep_cpu_unplug(cpu_num_t cpu_id);

 /* Should be invoked by platform_mp_cpu_unplug to ask the
 * arch to do whatever it needs to do to stop the CPU */
-status_t arch_mp_cpu_unplug(uint cpu_id);
+status_t arch_mp_cpu_unplug(cpu_num_t cpu_id);

 void arch_mp_init_percpu(void);

@@ -16,6 +16,7 @@

 #include <arch/defines.h>
 #include <kernel/atomic.h>
+#include <kernel/cpu.h>
 #include <magenta/compiler.h>
 #include <stdbool.h>
 #include <stddef.h>
@@ -32,7 +33,7 @@ static bool arch_in_int_handler(void);

 static uint64_t arch_cycle_count(void);

-static uint arch_curr_cpu_num(void);
+static cpu_num_t arch_curr_cpu_num(void);
 static uint arch_max_num_cpus(void);

 /* Use to align structures on cache lines to avoid cpu aliasing. */
@@ -0,0 +1,59 @@
+// Copyright 2016 The Fuchsia Authors
+//
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file or at
+// https://opensource.org/licenses/MIT
+
+#pragma once
+
+#include <assert.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+// types and routines for dealing with lists of cpus and cpu masks
+
+// cpu_mask_t holds one bit per cpu (bit N stands for cpu number N);
+// cpu_num_t is the index of a single cpu.
+typedef uint32_t cpu_mask_t;
+typedef uint32_t cpu_num_t;
+
+// the mask type needs a bit for every possible cpu number
+static_assert(SMP_MAX_CPUS <= sizeof(cpu_mask_t) * CHAR_BIT, "");
+
+#define INVALID_CPU ((cpu_num_t)-1)
+#define CPU_MASK_ALL ((cpu_mask_t)-1)
+
+// returns true if num is below SMP_MAX_CPUS
+static inline bool is_valid_cpu_num(cpu_num_t num) {
+    return (num < SMP_MAX_CPUS);
+}
+
+// returns the single-bit mask for cpu num, or an empty mask (0) if num is out of range
+static inline cpu_mask_t cpu_num_to_mask(cpu_num_t num) {
+    if (!is_valid_cpu_num(num))
+        return 0;
+
+    return ((cpu_mask_t)1u << num);
+}
+
+// returns the number of the highest cpu set in mask.
+// NOTE: an empty mask also yields 0, which cannot be told apart from cpu 0,
+// so callers need to rule out an empty mask themselves.
+static inline cpu_num_t highest_cpu_set(cpu_mask_t mask) {
+    if (mask == 0)
+        return 0;
+
+    return (cpu_num_t)(sizeof(cpu_mask_t) * CHAR_BIT - 1) - __builtin_clz(mask);
+}
+
+// returns the number of the lowest cpu set in mask.
+// NOTE: an empty mask also yields 0, which cannot be told apart from cpu 0,
+// so callers need to rule out an empty mask themselves.
+static inline cpu_num_t lowest_cpu_set(cpu_mask_t mask) {
+    if (mask == 0)
+        return 0;
+
+    return (cpu_num_t)(__builtin_ctz(mask));
+}
+
@@ -7,21 +7,17 @@

 #pragma once

+#include <kernel/cpu.h>
 #include <kernel/mutex.h>
-#include <kernel/thread.h>
-#include <limits.h>
 #include <magenta/compiler.h>
 #include <stdbool.h>
 #include <stdint.h>

 __BEGIN_CDECLS

-typedef uint32_t mp_cpu_mask_t;
 typedef void (*mp_ipi_task_func_t)(void* context);
 typedef void (*mp_sync_task_t)(void* context);

-static_assert(SMP_MAX_CPUS <= sizeof(mp_cpu_mask_t) * CHAR_BIT, "");
-
 /* by default, mp_mbx_reschedule does not signal to cpus that are running realtime
 * threads. Override this behavior.
 */
@@ -46,8 +42,8 @@ typedef enum {

 void mp_init(void);

-void mp_reschedule(mp_ipi_target_t, mp_cpu_mask_t mask, uint flags);
-void mp_sync_exec(mp_ipi_target_t, mp_cpu_mask_t mask, mp_sync_task_t task, void* context);
+void mp_reschedule(mp_ipi_target_t, cpu_mask_t mask, uint flags);
+void mp_sync_exec(mp_ipi_target_t, cpu_mask_t mask, mp_sync_task_t task, void* context);

 status_t mp_hotplug_cpu(uint cpu_id);
 status_t mp_unplug_cpu(uint cpu_id);
@@ -68,13 +64,13 @@ struct mp_ipi_task {
 /* global mp state to track what the cpus are up to */
 struct mp_state {
    /* cpus that are currently online */
-    volatile mp_cpu_mask_t online_cpus;
+    volatile cpu_mask_t online_cpus;
    /* cpus that are currently schedulable */
-    volatile mp_cpu_mask_t active_cpus;
+    volatile cpu_mask_t active_cpus;

    /* only safely accessible with thread lock held */
-    mp_cpu_mask_t idle_cpus;
-    mp_cpu_mask_t realtime_cpus;
+    cpu_mask_t idle_cpus;
+    cpu_mask_t realtime_cpus;

    spin_lock_t ipi_task_lock;
    /* list of outstanding tasks for CPUs to execute.  Should only be
@@ -90,48 +86,48 @@ extern struct mp_state mp;
 void mp_set_curr_cpu_online(bool online);
 void mp_set_curr_cpu_active(bool active);

-static inline int mp_is_cpu_active(uint cpu) {
-    return atomic_load((int*)&mp.active_cpus) & (1 << cpu);
+static inline int mp_is_cpu_active(cpu_num_t cpu) {
+    return atomic_load((int*)&mp.active_cpus) & cpu_num_to_mask(cpu);
 }

-static inline int mp_is_cpu_idle(uint cpu) {
-    return mp.idle_cpus & (1 << cpu);
+static inline int mp_is_cpu_idle(cpu_num_t cpu) {
+    return mp.idle_cpus & cpu_num_to_mask(cpu);
 }

-static inline int mp_is_cpu_online(uint cpu) {
-    return mp.online_cpus & (1 << cpu);
+static inline int mp_is_cpu_online(cpu_num_t cpu) {
+    return mp.online_cpus & cpu_num_to_mask(cpu);
 }

 /* must be called with the thread lock held */
-static inline void mp_set_cpu_idle(uint cpu) {
-    mp.idle_cpus |= 1U << cpu;
+static inline void mp_set_cpu_idle(cpu_num_t cpu) {
+    mp.idle_cpus |= cpu_num_to_mask(cpu);
 }

-static inline void mp_set_cpu_busy(uint cpu) {
-    mp.idle_cpus &= ~(1U << cpu);
+static inline void mp_set_cpu_busy(cpu_num_t cpu) {
+    mp.idle_cpus &= ~cpu_num_to_mask(cpu);
 }

-static inline mp_cpu_mask_t mp_get_idle_mask(void) {
+static inline cpu_mask_t mp_get_idle_mask(void) {
    return mp.idle_cpus;
 }

-static inline mp_cpu_mask_t mp_get_active_mask(void) {
+static inline cpu_mask_t mp_get_active_mask(void) {
    return atomic_load((int*)&mp.active_cpus);
 }

-static inline mp_cpu_mask_t mp_get_online_mask(void) {
+static inline cpu_mask_t mp_get_online_mask(void) {
    return mp.online_cpus;
 }

-static inline void mp_set_cpu_realtime(uint cpu) {
-    mp.realtime_cpus |= 1U << cpu;
+static inline void mp_set_cpu_realtime(cpu_num_t cpu) {
+    mp.realtime_cpus |= cpu_num_to_mask(cpu);
 }

-static inline void mp_set_cpu_non_realtime(uint cpu) {
-    mp.realtime_cpus &= ~(1U << cpu);
+static inline void mp_set_cpu_non_realtime(cpu_num_t cpu) {
+    mp.realtime_cpus &= ~cpu_num_to_mask(cpu);
 }

-static inline mp_cpu_mask_t mp_get_realtime_mask(void) {
+static inline cpu_mask_t mp_get_realtime_mask(void) {
    return mp.realtime_cpus;
 }

@@ -22,6 +22,10 @@ struct percpu {
    /* per cpu preemption timer */
    timer_t preempt_timer;

+    /* per cpu run queue */
+    struct list_node run_queue[NUM_PRIORITIES];
+    uint32_t run_queue_bitmap;
+
    /* thread/cpu level statistics */
    struct cpu_stats stats;

@@ -7,6 +7,7 @@
 #pragma once

 #include <kernel/thread.h>
+#include <magenta/compiler.h>
 #include <list.h>
 #include <stdbool.h>

@@ -15,9 +16,13 @@
 void sched_init_early(void);

 void sched_block(void);
-void sched_unblock(thread_t* t);
-void sched_unblock_list(struct list_node* list);
 void sched_yield(void);
 void sched_preempt(void);
 void sched_reschedule(void);
 void sched_resched_internal(void);
+void sched_unblock_idle(thread_t* t);
+void sched_migrate(thread_t* t);
+
+/* return true if the thread was placed on the current cpu's run queue */
+bool sched_unblock(thread_t* t) __WARN_UNUSED_RESULT;
+bool sched_unblock_list(struct list_node* list) __WARN_UNUSED_RESULT;
@@ -11,6 +11,7 @@
 #include <arch/ops.h>
 #include <arch/thread.h>
 #include <debug.h>
+#include <kernel/cpu.h>
 #include <kernel/spinlock.h>
 #include <kernel/wait.h>
 #include <list.h>
@@ -91,8 +92,9 @@ typedef struct thread {
    int base_priority;
    int priority_boost;

-    uint last_cpu;  /* last/current cpu the thread is running on */
-    int pinned_cpu; /* only run on pinned_cpu if >= 0 */
+    cpu_num_t curr_cpu;  /* current cpu the thread is either running on or in the ready queue */
+    cpu_num_t last_cpu;  /* last cpu the thread ran on, INVALID_CPU if it's never run */
+    cpu_mask_t cpu_affinity; /* mask of cpus that this thread can run on */

    /* pointer to the kernel address space this thread is associated with */
    struct vmm_aspace* aspace;
@@ -152,22 +154,6 @@ typedef struct thread {
 #endif
 } thread_t;

-static inline uint thread_last_cpu(const thread_t* t) {
-    return t->last_cpu;
-}
-
-static inline void thread_set_last_cpu(thread_t* t, uint c) {
-    t->last_cpu = c;
-}
-
-static inline int thread_pinned_cpu(const thread_t* t) {
-    return t->pinned_cpu;
-}
-
-static inline void thread_set_pinned_cpu(thread_t* t, int c) {
-    t->pinned_cpu = c;
-}
-
 /* thread priority */
 #define NUM_PRIORITIES (32)
 #define LOWEST_PRIORITY (0)
@@ -203,7 +189,11 @@ status_t thread_suspend(thread_t*);
 void thread_signal_policy_exception(void);
 void thread_exit(int retcode) __NO_RETURN;
 void thread_forget(thread_t*);
-void thread_migrate_cpu(const uint target_cpuid);
+
+void thread_set_cpu_affinity(thread_t* t, cpu_mask_t mask);
+
+// TODO: rename to something better
+void thread_migrate_cpu(cpu_num_t target_cpuid);

 status_t thread_detach(thread_t* t);
 status_t thread_join(thread_t* t, int* retcode, lk_time_t deadline);
@@ -55,12 +55,6 @@ int wait_queue_wake_one(wait_queue_t*, bool reschedule, status_t wait_queue_erro
 int wait_queue_wake_all(wait_queue_t*, bool reschedule, status_t wait_queue_error);
 struct thread* wait_queue_dequeue_one(wait_queue_t* wait, status_t wait_queue_error);

-/*
- * remove the thread from whatever wait queue it's in.
- * return an error if the thread is not currently blocked (or is the current thread)
- */
-status_t thread_unblock_from_wait_queue(struct thread* t, status_t wait_queue_error);
-
 /* is the wait queue currently empty */
 bool wait_queue_is_empty(wait_queue_t*);

@@ -40,8 +40,8 @@ void mp_init(void) {
    }
 }

-void mp_reschedule(mp_ipi_target_t target, mp_cpu_mask_t mask, uint flags) {
-    uint local_cpu = arch_curr_cpu_num();
+void mp_reschedule(mp_ipi_target_t target, cpu_mask_t mask, uint flags) {
+    const cpu_num_t local_cpu = arch_curr_cpu_num();

    LTRACEF("local %u, target %u, mask %#x\n", local_cpu, target, mask);

@@ -56,7 +56,7 @@ void mp_reschedule(mp_ipi_target_t target, mp_cpu_mask_t mask, uint flags) {

            /* mask out cpus that are not active and the local cpu */
            mask &= mp.active_cpus;
-            mask &= ~(1U << local_cpu);
+            mask &= ~cpu_num_to_mask(local_cpu);

            /* mask out cpus that are currently running realtime code */
            if ((flags & MP_RESCHEDULE_FLAG_REALTIME) == 0) {
@@ -74,7 +74,7 @@ struct mp_sync_context {
    mp_sync_task_t task;
    void* task_context;
    /* Mask of which CPUs need to finish the task */
-    volatile mp_cpu_mask_t outstanding_cpus;
+    volatile cpu_mask_t outstanding_cpus;
 };

 static void mp_sync_task(void* raw_context) {
@@ -82,7 +82,7 @@ static void mp_sync_task(void* raw_context) {
    context->task(context->task_context);
    /* use seq-cst atomic to ensure this update is not seen before the
     * side-effects of context->task */
-    atomic_and((int*)&context->outstanding_cpus, ~(1U << arch_curr_cpu_num()));
+    atomic_and((int*)&context->outstanding_cpus, ~cpu_num_to_mask(arch_curr_cpu_num()));
    arch_spinloop_signal();
 }

@@ -94,7 +94,7 @@ static void mp_sync_task(void* raw_context) {
 *
 * Interrupts must be disabled if calling with MP_IPI_TARGET_ALL_BUT_LOCAL as target
 */
-void mp_sync_exec(mp_ipi_target_t target, mp_cpu_mask_t mask, mp_sync_task_t task, void* context) {
+void mp_sync_exec(mp_ipi_target_t target, cpu_mask_t mask, mp_sync_task_t task, void* context) {
    uint num_cpus = arch_max_num_cpus();

    if (target == MP_IPI_TARGET_ALL) {
@@ -103,7 +103,7 @@ void mp_sync_exec(mp_ipi_target_t target, mp_cpu_mask_t mask, mp_sync_task_t tas
        /* targeting all other CPUs but the current one is hazardous
         * if the local CPU may be changed underneath us */
        DEBUG_ASSERT(arch_ints_disabled());
-        mask = mp_get_online_mask() & ~(1U << arch_curr_cpu_num());
+        mask = mp_get_online_mask() & ~cpu_num_to_mask(arch_curr_cpu_num());
    }

    /* Mask any offline CPUs from target list */
@@ -114,11 +114,11 @@ void mp_sync_exec(mp_ipi_target_t target, mp_cpu_mask_t mask, mp_sync_task_t tas
    arch_interrupt_save(&irqstate, SPIN_LOCK_FLAG_INTERRUPTS);
    smp_mb();

-    uint local_cpu = arch_curr_cpu_num();
+    const cpu_num_t local_cpu = arch_curr_cpu_num();

    /* remove self from target lists, since no need to IPI ourselves */
-    bool targetting_self = !!(mask & (1U << local_cpu));
-    mask &= ~(1U << local_cpu);
+    bool targetting_self = !!(mask & cpu_num_to_mask(local_cpu));
+    mask &= ~cpu_num_to_mask(local_cpu);

    /* create tasks to enqueue (we need one per target due to each containing
     * a linked list node */
@@ -136,7 +136,7 @@ void mp_sync_exec(mp_ipi_target_t target, mp_cpu_mask_t mask, mp_sync_task_t tas

    /* enqueue tasks */
    spin_lock(&mp.ipi_task_lock);
-    mp_cpu_mask_t remaining = mask;
+    cpu_mask_t remaining = mask;
    uint cpu_id = 0;
    while (remaining && cpu_id < num_cpus) {
        if (remaining & 1) {
@@ -164,9 +164,9 @@ void mp_sync_exec(mp_ipi_target_t target, mp_cpu_mask_t mask, mp_sync_task_t tas
    while (1) {
        /* See comment in mp_unplug_trampoline about related CPU hotplug
         * guarantees. */
-        mp_cpu_mask_t outstanding = atomic_load_relaxed(
+        cpu_mask_t outstanding = atomic_load_relaxed(
            (int*)&sync_context.outstanding_cpus);
-        mp_cpu_mask_t online = mp_get_online_mask();
+        cpu_mask_t online = mp_get_online_mask();
        if ((outstanding & online) == 0) {
            break;
        }
@@ -294,7 +294,7 @@ status_t mp_unplug_cpu(uint cpu_id) {
    }

    /* Pin to the target CPU */
-    thread_set_pinned_cpu(t, cpu_id);
+    thread_set_cpu_affinity(t, cpu_num_to_mask(cpu_id));
    /* Set real time to cancel the pre-emption timer */
    thread_set_real_time(t);

@@ -331,23 +331,23 @@ cleanup_mutex:

 void mp_set_curr_cpu_online(bool online) {
    if (online) {
-        atomic_or((volatile int*)&mp.online_cpus, 1U << arch_curr_cpu_num());
+        atomic_or((volatile int*)&mp.online_cpus, cpu_num_to_mask(arch_curr_cpu_num()));
    } else {
-        atomic_and((volatile int*)&mp.online_cpus, ~(1U << arch_curr_cpu_num()));
+        atomic_and((volatile int*)&mp.online_cpus, ~cpu_num_to_mask(arch_curr_cpu_num()));
    }
 }

 void mp_set_curr_cpu_active(bool active) {
    if (active) {
-        atomic_or((volatile int*)&mp.active_cpus, 1U << arch_curr_cpu_num());
+        atomic_or((volatile int*)&mp.active_cpus, cpu_num_to_mask(arch_curr_cpu_num()));
    } else {
-        atomic_and((volatile int*)&mp.active_cpus, ~(1U << arch_curr_cpu_num()));
+        atomic_and((volatile int*)&mp.active_cpus, ~cpu_num_to_mask(arch_curr_cpu_num()));
    }
 }

 enum handler_return mp_mbx_generic_irq(void) {
    DEBUG_ASSERT(arch_ints_disabled());
-    uint local_cpu = arch_curr_cpu_num();
+    const cpu_num_t local_cpu = arch_curr_cpu_num();

    CPU_STATS_INC(generic_ipis);

@@ -366,13 +366,13 @@ enum handler_return mp_mbx_generic_irq(void) {
 }

 enum handler_return mp_mbx_reschedule_irq(void) {
-    uint cpu = arch_curr_cpu_num();
+    const cpu_num_t cpu = arch_curr_cpu_num();

    LTRACEF("cpu %u\n", cpu);

    CPU_STATS_INC(reschedule_ipis);

-    return (mp.active_cpus & (1U << cpu)) ? INT_RESCHEDULE : INT_NO_RESCHEDULE;
+    return (mp.active_cpus & cpu_num_to_mask(cpu)) ? INT_RESCHEDULE : INT_NO_RESCHEDULE;
 }

 __WEAK status_t arch_mp_cpu_hotplug(uint cpu_id) {
@@ -156,8 +156,8 @@ static inline void mutex_release_internal(mutex_t* m, bool reschedule, bool thre
    }

    // put the new thread back in the run queue and optionally reschedule locally
-    sched_unblock(t);
-    if (reschedule)
+    bool local_resched = sched_unblock(t);
+    if (reschedule && local_resched)
        sched_reschedule();

    // conditionally THREAD_UNLOCK
@@ -21,6 +21,7 @@
 #include <printf.h>
 #include <string.h>
 #include <target.h>
+#include <trace.h>

 /* disable priority boosting */
 #define NO_BOOST 0
@@ -38,6 +39,8 @@
 #define LOCAL_KTRACE2(probe, x, y)
 #endif

+#define LOCAL_TRACE 0
+
 #define DEBUG_THREAD_CONTEXT_SWITCH 0

 #define TRACE_CONTEXT_SWITCH(str, x...)  \
@@ -49,12 +52,8 @@
 /* threads get 10ms to run before they use up their time slice and the scheduler is invoked */
 #define THREAD_INITIAL_TIME_SLICE LK_MSEC(10)

-/* the run queue */
-static struct list_node run_queue[NUM_PRIORITIES];
-static uint32_t run_queue_bitmap;
-
 /* make sure the bitmap is large enough to cover our number of priorities */
-static_assert(NUM_PRIORITIES <= sizeof(run_queue_bitmap) * CHAR_BIT, "");
+static_assert(NUM_PRIORITIES <= sizeof(percpu[0].run_queue_bitmap) * CHAR_BIT, "");

 /* compute the effective priority of a thread */
 static int effec_priority(const thread_t* t) {
@@ -110,18 +109,18 @@ static void deboost_thread(thread_t* t, bool quantum_expiration) {
    t->priority_boost--;
 }

-/* pick a 'random' cpu */
-static mp_cpu_mask_t rand_cpu(const mp_cpu_mask_t mask) {
+/* pick a 'random' cpu out of the passed in mask of cpus */
+static cpu_mask_t rand_cpu(const cpu_mask_t mask) {
    if (unlikely(mask == 0))
        return 0;

    /* check that the mask passed in has at least one bit set in the online mask */
-    mp_cpu_mask_t online = mp_get_online_mask();
+    cpu_mask_t online = mp_get_online_mask();
    if (unlikely((mask & online) == 0))
        return 0;

    /* compute the highest online cpu */
-    uint highest_cpu = (sizeof(mp_cpu_mask_t) * CHAR_BIT - 1) - __builtin_clz(online);
+    cpu_num_t highest_cpu = highest_cpu_set(online);

    /* not very random, round robins a bit through the mask until it gets a hit */
    for (;;) {
@@ -136,19 +135,26 @@ static mp_cpu_mask_t rand_cpu(const mp_cpu_mask_t mask) {
    }
 }

-/* find a cpu to wake up */
-static mp_cpu_mask_t find_cpu(thread_t* t) {
+/* find a cpu to wake up, 0 means local cpu */
+static cpu_mask_t find_cpu_mask(thread_t* t) {
    /* get the last cpu the thread ran on */
-    mp_cpu_mask_t last_ran_cpu_mask = (1u << thread_last_cpu(t));
+    cpu_mask_t last_ran_cpu_mask = cpu_num_to_mask(t->last_cpu);

    /* the current cpu */
-    mp_cpu_mask_t curr_cpu_mask = (1u << arch_curr_cpu_num());
+    cpu_mask_t curr_cpu_mask = cpu_num_to_mask(arch_curr_cpu_num());

-    /* get a list of idle cpus */
-    mp_cpu_mask_t idle_cpu_mask = mp_get_idle_mask();
+    /* the thread's affinity mask */
+    cpu_mask_t cpu_affinity = t->cpu_affinity;
+
+    LTRACEF_LEVEL(2, "last %#x curr %#x aff %#x name %s\n",
+            last_ran_cpu_mask, curr_cpu_mask, cpu_affinity, t->name);
+
+    /* get a list of idle cpus and mask off the ones that aren't in our affinity mask */
+    cpu_mask_t idle_cpu_mask = mp_get_idle_mask();
+    idle_cpu_mask &= cpu_affinity;
    if (idle_cpu_mask != 0) {
        if (idle_cpu_mask & curr_cpu_mask) {
-            /* the current cpu is idle, so run it here */
+            /* the current cpu is idle and within our affinity mask, so run it here */
            return 0;
        }

@@ -162,60 +168,69 @@ static mp_cpu_mask_t find_cpu(thread_t* t) {
    }

    /* no idle cpus */
-    if (last_ran_cpu_mask == curr_cpu_mask) {
-        /* the last cpu it ran on is us */
-        /* pick a random cpu that isn't the current one */
-        return rand_cpu(mp_get_online_mask() & ~(curr_cpu_mask));
-    } else {
-        /* pick the last cpu it ran on */
+
+    /* if the last cpu it ran on is in the affinity mask and not the current cpu, pick that */
+    if ((last_ran_cpu_mask & cpu_affinity) && last_ran_cpu_mask != curr_cpu_mask) {
        return last_ran_cpu_mask;
    }
+
+    /* fall back to picking a cpu out of the affinity mask, preferring something other
+     * than the local cpu.
+     */
+    cpu_mask_t mask = cpu_affinity & ~(curr_cpu_mask);
+    if (mask == 0)
+        return 0; /* local cpu is the only choice */
+
+    return rand_cpu(mask);
 }

 /* run queue manipulation */
-static void insert_in_run_queue_head(thread_t* t) {
+static void insert_in_run_queue_head(cpu_num_t cpu, thread_t* t) {
    DEBUG_ASSERT(!list_in_list(&t->queue_node));

    int ep = effec_priority(t);

-    list_add_head(&run_queue[ep], &t->queue_node);
-    run_queue_bitmap |= (1u << ep);
+    list_add_head(&percpu[cpu].run_queue[ep], &t->queue_node);
+    percpu[cpu].run_queue_bitmap |= (1u << ep);
+    mp_set_cpu_busy(cpu);
 }

-static void insert_in_run_queue_tail(thread_t* t) {
+static void insert_in_run_queue_tail(cpu_num_t cpu, thread_t* t) {
    DEBUG_ASSERT(!list_in_list(&t->queue_node));

    int ep = effec_priority(t);

-    list_add_tail(&run_queue[ep], &t->queue_node);
-    run_queue_bitmap |= (1u << ep);
+    list_add_tail(&percpu[cpu].run_queue[ep], &t->queue_node);
+    percpu[cpu].run_queue_bitmap |= (1u << ep);
+    mp_set_cpu_busy(cpu);
 }

-static thread_t* sched_get_top_thread(uint cpu) {
-    thread_t* newthread;
-    uint32_t local_run_queue_bitmap = run_queue_bitmap;
+static thread_t* sched_get_top_thread(cpu_num_t cpu) {
+    /* pop the head of the highest priority queue with any threads
+     * queued up on the passed in cpu. If none of that cpu's queues hold a
+     * thread, its idle thread is returned instead.
+     */
+    struct percpu* c = &percpu[cpu];
+    if (likely(c->run_queue_bitmap)) {
+        uint highest_queue = HIGHEST_PRIORITY - __builtin_clz(c->run_queue_bitmap) - (sizeof(c->run_queue_bitmap) * CHAR_BIT - NUM_PRIORITIES);

-    while (local_run_queue_bitmap) {
-        /* find the first (remaining) queue with a thread in it */
-        uint next_queue = HIGHEST_PRIORITY - __builtin_clz(local_run_queue_bitmap) - (sizeof(run_queue_bitmap) * CHAR_BIT - NUM_PRIORITIES);
+        thread_t* newthread = list_remove_head_type(&c->run_queue[highest_queue], thread_t, queue_node);

-        list_for_every_entry (&run_queue[next_queue], newthread, thread_t, queue_node) {
-            if (likely(newthread->pinned_cpu < 0) || (uint)newthread->pinned_cpu == cpu) {
-                list_delete(&newthread->queue_node);
+        DEBUG_ASSERT(newthread);
+        DEBUG_ASSERT_MSG(newthread->cpu_affinity & cpu_num_to_mask(cpu),
+                         "thread %p name %s, aff %#x cpu %u\n", newthread, newthread->name,
+                         newthread->cpu_affinity, cpu);
+        DEBUG_ASSERT(newthread->curr_cpu == cpu);

-                if (list_is_empty(&run_queue[next_queue]))
-                    run_queue_bitmap &= ~(1 << next_queue);
+        if (list_is_empty(&c->run_queue[highest_queue]))
+            c->run_queue_bitmap &= ~(1u << highest_queue);

-                LOCAL_KTRACE2("sched_get_top", newthread->priority_boost, newthread->base_priority);
+        LOCAL_KTRACE2("sched_get_top", newthread->priority_boost, newthread->base_priority);

-                return newthread;
-            }
-        }
-
-        local_run_queue_bitmap &= ~(1 << next_queue);
+        return newthread;
     }
+
     /* no threads to run, select the idle thread for this cpu */
-    return &percpu[cpu].idle_thread;
+    return &c->idle_thread;
 }

 void sched_block(void) {
@@ -232,7 +247,27 @@ void sched_block(void) {
    sched_resched_internal();
 }

-void sched_unblock(thread_t* t) {
+/* find a cpu to run the thread on, put it in the run queue for that cpu, and accumulate a list
+ * of cpus we'll need to reschedule, including the local cpu.
+ *
+ * t:              thread to place; its curr_cpu is set to the chosen cpu and it is inserted at
+ *                 the head of that cpu's run queue.
+ * local_resched:  set to true when the local cpu was chosen; left untouched otherwise.
+ * accum_cpu_mask: the chosen cpu's bit is or'd in when a non-local cpu was chosen; left
+ *                 untouched otherwise.
+ */
+static void find_cpu_and_insert(thread_t* t, bool* local_resched, cpu_mask_t *accum_cpu_mask) {
+    /* find a core to run it on; a result of 0 means the local cpu */
+    cpu_mask_t cpu = find_cpu_mask(t);
+    cpu_num_t cpu_num;
+
+    if (cpu == 0) {
+        cpu_num = arch_curr_cpu_num();
+        *local_resched = true;
+    } else {
+        cpu_num = lowest_cpu_set(cpu);
+        *accum_cpu_mask |= cpu_num_to_mask(cpu_num);
+    }
+
+    t->curr_cpu = cpu_num;
+    insert_in_run_queue_head(cpu_num, t);
+}
+
+bool sched_unblock(thread_t* t) {
    DEBUG_ASSERT(spin_lock_held(&thread_lock));

    DEBUG_ASSERT(t->magic == THREAD_MAGIC);
@@ -244,18 +279,25 @@ void sched_unblock(thread_t* t) {

    /* stuff the new thread in the run queue */
    t->state = THREAD_READY;
-    insert_in_run_queue_head(t);

-    mp_reschedule(MP_IPI_TARGET_MASK, find_cpu(t), 0);
+    bool local_resched = false;
+    cpu_mask_t mask = 0;
+    find_cpu_and_insert(t, &local_resched, &mask);
+
+    if (mask)
+        mp_reschedule(MP_IPI_TARGET_MASK, mask, 0);
+    return local_resched;
 }

-void sched_unblock_list(struct list_node* list) {
+bool sched_unblock_list(struct list_node* list) {
    DEBUG_ASSERT(list);
    DEBUG_ASSERT(spin_lock_held(&thread_lock));

    LOCAL_KTRACE0("sched_unblock_list");

    /* pop the list of threads and shove into the scheduler */
+    bool local_resched = false;
+    cpu_mask_t accum_cpu_mask = 0;
    thread_t* t;
    while ((t = list_remove_tail_type(list, thread_t, queue_node))) {
        DEBUG_ASSERT(t->magic == THREAD_MAGIC);
@@ -266,12 +308,32 @@ void sched_unblock_list(struct list_node* list) {

        /* stuff the new thread in the run queue */
        t->state = THREAD_READY;
-        insert_in_run_queue_head(t);
-
-        mp_reschedule(MP_IPI_TARGET_MASK, find_cpu(t), 0);
+        find_cpu_and_insert(t, &local_resched, &accum_cpu_mask);
    }
+
+    if (accum_cpu_mask)
+        mp_reschedule(MP_IPI_TARGET_MASK, accum_cpu_mask, 0);
+
+    return local_resched;
 }

+/* handle the special case of resuming a newly created idle thread.
+ *
+ * Must be called with thread_lock held. The thread is asserted to be an idle thread whose
+ * affinity mask names exactly one cpu; it is marked ready and queued on that cpu. Unlike
+ * sched_unblock(), no cpu search is done here and no reschedule IPI is sent.
+ */
+void sched_unblock_idle(thread_t* t) {
+    DEBUG_ASSERT(spin_lock_held(&thread_lock));
+
+    DEBUG_ASSERT(thread_is_idle(t));
+    DEBUG_ASSERT(t->cpu_affinity && (t->cpu_affinity & (t->cpu_affinity - 1)) == 0); /* exactly one bit set */
+
+    /* idle thread is special case, just jam it into the cpu's run queue in the thread's
+     * affinity mask and mark it ready.
+     */
+    t->state = THREAD_READY;
+    cpu_num_t cpu = lowest_cpu_set(t->cpu_affinity);
+    t->curr_cpu = cpu;
+    insert_in_run_queue_head(cpu, t);
+}
+
+/* the thread is voluntarily giving up its time slice */
 void sched_yield(void) {
    DEBUG_ASSERT(spin_lock_held(&thread_lock));

@@ -282,10 +344,10 @@ void sched_yield(void) {

    current_thread->state = THREAD_READY;

-    /* consume the rest of the time slice, deboost ourself, and go to the end of the queue */
+    /* consume the rest of the time slice, deboost ourself, and go to the end of a queue */
    current_thread->remaining_time_slice = 0;
    deboost_thread(current_thread, false);
-    insert_in_run_queue_tail(current_thread);
+    insert_in_run_queue_tail(arch_curr_cpu_num(), current_thread);

    sched_resched_internal();
 }
@@ -295,7 +357,10 @@ void sched_preempt(void) {
    DEBUG_ASSERT(spin_lock_held(&thread_lock));

    thread_t* current_thread = get_current_thread();
+    uint curr_cpu = arch_curr_cpu_num();

+    DEBUG_ASSERT(current_thread->curr_cpu == curr_cpu);
+    DEBUG_ASSERT(current_thread->last_cpu == current_thread->curr_cpu);
    LOCAL_KTRACE0("sched_preempt");

    current_thread->state = THREAD_READY;
@@ -303,11 +368,11 @@ void sched_preempt(void) {
    /* idle thread doesn't go in the run queue */
    if (likely(!thread_is_idle(current_thread))) {
        if (current_thread->remaining_time_slice > 0) {
-            insert_in_run_queue_head(current_thread);
+            insert_in_run_queue_head(curr_cpu, current_thread);
        } else {
-            /* if we're out of quantum, deboost the thread and put it at the tail of the queue */
+            /* if we're out of quantum, deboost the thread and put it at the tail of a queue */
            deboost_thread(current_thread, true);
-            insert_in_run_queue_tail(current_thread);
+            insert_in_run_queue_tail(curr_cpu, current_thread);
        }
    }

@@ -319,7 +384,10 @@ void sched_reschedule(void) {
    DEBUG_ASSERT(spin_lock_held(&thread_lock));

    thread_t* current_thread = get_current_thread();
+    uint curr_cpu = arch_curr_cpu_num();

+    DEBUG_ASSERT(current_thread->curr_cpu == curr_cpu);
+    DEBUG_ASSERT(current_thread->last_cpu == current_thread->curr_cpu);
    LOCAL_KTRACE0("sched_reschedule");

    current_thread->state = THREAD_READY;
@@ -331,15 +399,64 @@ void sched_reschedule(void) {
        deboost_thread(current_thread, false);

        if (current_thread->remaining_time_slice > 0) {
-            insert_in_run_queue_head(current_thread);
+            insert_in_run_queue_head(curr_cpu, current_thread);
        } else {
-            insert_in_run_queue_tail(current_thread);
+            insert_in_run_queue_tail(curr_cpu, current_thread);
        }
    }

    sched_resched_internal();
 }

+/* potentially migrate a thread to a new core based on the affinity mask on the thread. If it's
+ * running or in a scheduler queue, handle it.
+ *
+ * Must be called with thread_lock held and expects t->cpu_affinity to already hold the new mask.
+ * Threads in any state other than THREAD_RUNNING or THREAD_READY are left alone. May send
+ * reschedule IPIs and may reschedule the local cpu before returning.
+ */
+void sched_migrate(thread_t *t) {
+    DEBUG_ASSERT(spin_lock_held(&thread_lock));
+
+    bool local_resched = false;
+    cpu_mask_t accum_cpu_mask = 0;
+    if (t->state == THREAD_RUNNING) {
+        // see if we need to migrate
+        if (t->cpu_affinity & cpu_num_to_mask(t->curr_cpu)) {
+            // it's running and the new mask contains the core it's already running on, nothing to do.
+            return;
+        }
+
+        // we need to migrate
+        if (t == get_current_thread()) {
+            // current thread, so just shove ourself into another cpu's queue and reschedule locally
+            t->state = THREAD_READY;
+            find_cpu_and_insert(t, &local_resched, &accum_cpu_mask);
+
+            // find_cpu_and_insert() only requests a local reschedule when it picks the local
+            // cpu, which is not in our affinity mask here. We must switch away regardless,
+            // otherwise we would keep running on this cpu while also sitting in another cpu's
+            // run queue.
+            local_resched = true;
+        } else {
+            // running on another cpu, interrupt and let sched_preempt() sort it out
+            accum_cpu_mask = cpu_num_to_mask(t->curr_cpu);
+        }
+    } else if (t->state == THREAD_READY) {
+        if (t->cpu_affinity & cpu_num_to_mask(t->curr_cpu)) {
+            // it's ready and the new mask contains the core it's already waiting on, nothing to do.
+            return;
+        }
+
+        // it's sitting in a run queue somewhere, so pull it out of that one and find a new home
+        DEBUG_ASSERT(list_in_list(&t->queue_node));
+        list_delete(&t->queue_node);
+
+        // clear the old cpu's bitmap bit if that priority level just became empty
+        struct percpu* c = &percpu[t->curr_cpu];
+        int pri = effec_priority(t);
+        if (list_is_empty(&c->run_queue[pri])) {
+            c->run_queue_bitmap &= ~(1u << pri);
+        }
+
+        find_cpu_and_insert(t, &local_resched, &accum_cpu_mask);
+    }
+
+    // send some ipis and/or locally reschedule based on the previous code
+    if (accum_cpu_mask)
+        mp_reschedule(MP_IPI_TARGET_MASK, accum_cpu_mask, 0);
+    if (local_resched)
+        sched_resched_internal();
+}
+
 /* preemption timer that is set whenever a thread is scheduled */
 static enum handler_return sched_timer_tick(struct timer* t, lk_time_t now, void* arg) {
    /* if the preemption timer went off on the idle or a real time thread, ignore it */
@@ -429,13 +546,14 @@ void sched_resched_internal(void) {
    newthread->last_started_running = now;

    /* mark the cpu ownership of the threads */
-    thread_set_last_cpu(newthread, cpu);
+    if (oldthread->state != THREAD_READY)
+        oldthread->curr_cpu = INVALID_CPU;
+    newthread->last_cpu = cpu;
+    newthread->curr_cpu = cpu;

    /* set the cpu state based on the new thread we've picked */
    if (thread_is_idle(newthread)) {
        mp_set_cpu_idle(cpu);
-    } else {
-        mp_set_cpu_busy(cpu);
    }

    if (thread_is_realtime(newthread)) {
@@ -523,6 +641,7 @@ void sched_resched_internal(void) {

 void sched_init_early(void) {
    /* initialize the run queues */
-    for (unsigned int i = 0; i < NUM_PRIORITIES; i++)
-        list_initialize(&run_queue[i]);
+    for (unsigned int cpu = 0; cpu < SMP_MAX_CPUS; cpu++)
+        for (unsigned int i = 0; i < NUM_PRIORITIES; i++)
+            list_initialize(&percpu[cpu].run_queue[i]);
 }
@@ -48,11 +48,11 @@ spin_lock_t thread_lock = SPIN_LOCK_INITIAL_VALUE;
 static int idle_thread_routine(void*) __NO_RETURN;
 static void thread_exit_locked(thread_t* current_thread, int retcode) __NO_RETURN;
 static void thread_do_suspend(void);
+static status_t thread_unblock_from_wait_queue(thread_t* t, status_t wait_queue_error, bool* local_resched);

 static void init_thread_struct(thread_t* t, const char* name) {
    memset(t, 0, sizeof(thread_t));
    t->magic = THREAD_MAGIC;
-    thread_set_pinned_cpu(t, -1);
    strlcpy(t->name, name, sizeof(t->name));
    wait_queue_init(&t->retcode_wait_queue);
 }
@@ -127,7 +127,9 @@ thread_t* thread_create_etc(
    t->blocking_wait_queue = NULL;
    t->blocked_status = MX_OK;
    t->interruptable = false;
-    thread_set_last_cpu(t, 0);
+    t->curr_cpu = INVALID_CPU;
+    t->last_cpu = INVALID_CPU;
+    t->cpu_affinity = CPU_MASK_ALL;

    t->retcode = 0;
    wait_queue_init(&t->retcode_wait_queue);
@@ -270,8 +272,8 @@ status_t thread_resume(thread_t* t) {
    t->signals &= ~THREAD_SIGNAL_SUSPEND;

    if (t->state == THREAD_INITIAL || t->state == THREAD_SUSPENDED) {
-        sched_unblock(t);
-        if (resched)
+        bool local_resched = sched_unblock(t);
+        if (resched && local_resched)
            sched_reschedule();
    }

@@ -301,6 +303,7 @@ status_t thread_suspend(thread_t* t) {

    THREAD_LOCK(state);

+    bool local_resched = false;
    switch (t->state) {
    case THREAD_INITIAL:
    case THREAD_DEATH:
@@ -315,7 +318,7 @@ status_t thread_suspend(thread_t* t) {
        /* The following call is not essential.  It just makes the
             * thread suspension happen sooner rather than at the next
             * timer interrupt or syscall. */
-        mp_reschedule(MP_IPI_TARGET_MASK, 1u << thread_last_cpu(t), 0);
+        mp_reschedule(MP_IPI_TARGET_MASK, cpu_num_to_mask(t->curr_cpu), 0);
        break;
    case THREAD_SUSPENDED:
        /* thread is suspended already */
@@ -323,20 +326,23 @@ status_t thread_suspend(thread_t* t) {
    case THREAD_BLOCKED:
        /* thread is blocked on something and marked interruptable */
        if (t->interruptable)
-            thread_unblock_from_wait_queue(t, MX_ERR_INTERNAL_INTR_RETRY);
+            thread_unblock_from_wait_queue(t, MX_ERR_INTERNAL_INTR_RETRY, &local_resched);
        break;
    case THREAD_SLEEPING:
        /* thread is sleeping */
        if (t->interruptable) {
            t->blocked_status = MX_ERR_INTERNAL_INTR_RETRY;

-            sched_unblock(t);
+            local_resched = sched_unblock(t);
        }
        break;
    }

    t->signals |= THREAD_SIGNAL_SUSPEND;

+    if (local_resched)
+        sched_reschedule();
+
    THREAD_UNLOCK(state);

    return MX_OK;
@@ -540,6 +546,7 @@ void thread_kill(thread_t* t, bool block) {

    /* general logic is to wake up the thread so it notices it had a signal delivered to it */

+    bool local_resched = false;
    switch (t->state) {
    case THREAD_INITIAL:
        /* thread hasn't been started yet.
@@ -558,23 +565,23 @@ void thread_kill(thread_t* t, bool block) {
        /* The following call is not essential.  It just makes the
             * thread termination happen sooner rather than at the next
             * timer interrupt or syscall. */
-        mp_reschedule(MP_IPI_TARGET_MASK, 1u << thread_last_cpu(t), 0);
+        mp_reschedule(MP_IPI_TARGET_MASK, cpu_num_to_mask(t->curr_cpu), 0);
        break;
    case THREAD_SUSPENDED:
        /* thread is suspended, resume it so it can get the kill signal */
-        sched_unblock(t);
+        local_resched = sched_unblock(t);
        break;
    case THREAD_BLOCKED:
        /* thread is blocked on something and marked interruptable */
        if (t->interruptable)
-            thread_unblock_from_wait_queue(t, MX_ERR_INTERNAL_INTR_KILLED);
+            thread_unblock_from_wait_queue(t, MX_ERR_INTERNAL_INTR_KILLED, &local_resched);
        break;
    case THREAD_SLEEPING:
        /* thread is sleeping */
        if (t->interruptable) {
            t->blocked_status = MX_ERR_INTERNAL_INTR_KILLED;

-            sched_unblock(t);
+            local_resched = sched_unblock(t);
        }
        break;
    case THREAD_DEATH:
@@ -585,25 +592,39 @@ void thread_kill(thread_t* t, bool block) {
    /* wait for the thread to exit */
    if (block && !(t->flags & THREAD_FLAG_DETACHED)) {
        wait_queue_block(&t->retcode_wait_queue, INFINITE_TIME);
+    } else if (local_resched) {
+        sched_reschedule();
    }

 done:
    THREAD_UNLOCK(state);
 }

-/* Migrates the current thread to the CPU identified by target_cpuid. */
-void thread_migrate_cpu(const uint target_cpuid) {
-    thread_t* self = get_current_thread();
-    thread_set_pinned_cpu(self, target_cpuid);
+// Sets the cpu affinity mask of a thread to the passed in mask and migrates the thread if it is
+// running or queued. A mask containing no online cpu is silently ignored (affinity unchanged).
+void thread_set_cpu_affinity(thread_t* t, cpu_mask_t affinity) {
+    DEBUG_ASSERT(t->magic == THREAD_MAGIC);
 
-    mp_reschedule(MP_IPI_TARGET_MASK, 1u << target_cpuid, 0);
+    THREAD_LOCK(state);
 
-    // When we return from this call, we should have migrated to the target cpu
-    thread_yield();
+    // make sure the passed in mask is valid and at least one online cpu can run the thread
+    if ((affinity & mp_get_online_mask()) == 0) {
+        goto done;
+    }
 
-    // Make sure that we have actually migrated.
-    const uint current_cpu_id = thread_last_cpu(self);
-    DEBUG_ASSERT(current_cpu_id == target_cpuid);
+    // set the affinity mask
+    t->cpu_affinity = affinity;
+
+    // let the scheduler move the thread if its current cpu is no longer allowed
+    sched_migrate(t);
+
+done:
+    THREAD_UNLOCK(state);
+}
+
+/* Migrates the current thread to the CPU identified by target_cpu by restricting its affinity to that cpu. */
+void thread_migrate_cpu(const cpu_num_t target_cpu) {
+    thread_set_cpu_affinity(get_current_thread(), cpu_num_to_mask(target_cpu));
 }

 // thread_lock must be held when calling this function.  This function will
@@ -792,11 +813,11 @@ static enum handler_return thread_sleep_handler(timer_t* timer, lk_time_t now, v

    t->blocked_status = MX_OK;

-    sched_unblock(t);
+    bool local_resched = sched_unblock(t);

    spin_unlock(&thread_lock);

-    return INT_RESCHEDULE;
+    return local_resched ? INT_RESCHEDULE : INT_NO_RESCHEDULE;
 }

 #define MIN_SLEEP_SLACK LK_USEC(1)
@@ -906,9 +927,8 @@ lk_time_t thread_runtime(const thread_t* t) {
 */
 void thread_construct_first(thread_t* t, const char* name) {
    DEBUG_ASSERT(arch_ints_disabled());
-    /* Due to somethings below being macros, this might be unused on
-     * non-SMP builds */
-    __UNUSED uint cpu = arch_curr_cpu_num();
+
+    cpu_num_t cpu = arch_curr_cpu_num();

    init_thread_struct(t, name);
    t->base_priority = HIGHEST_PRIORITY;
@@ -916,8 +936,9 @@ void thread_construct_first(thread_t* t, const char* name) {
    t->state = THREAD_RUNNING;
    t->flags = THREAD_FLAG_DETACHED;
    t->signals = 0;
-    thread_set_last_cpu(t, cpu);
-    thread_set_pinned_cpu(t, cpu);
+    t->curr_cpu = cpu;
+    t->last_cpu = cpu;
+    t->cpu_affinity = cpu_num_to_mask(cpu);

    arch_thread_construct_first(t);

@@ -1012,7 +1033,10 @@ void thread_become_idle(void) {
    t->base_priority = IDLE_PRIORITY;
    t->priority_boost = 0;
    t->flags |= THREAD_FLAG_IDLE;
-    thread_set_pinned_cpu(t, arch_curr_cpu_num());
+    cpu_num_t curr_cpu = arch_curr_cpu_num();
+    t->last_cpu = curr_cpu;
+    t->curr_cpu = curr_cpu;
+    t->cpu_affinity = cpu_num_to_mask(curr_cpu);

    mp_set_curr_cpu_active(true);
    mp_set_cpu_idle(arch_curr_cpu_num());
@@ -1047,7 +1071,7 @@ void thread_secondary_cpu_entry(void) {
 /**
 * @brief Create an idle thread for a secondary CPU
 */
-thread_t* thread_create_idle_thread(uint cpu_num) {
+thread_t* thread_create_idle_thread(cpu_num_t cpu_num) {
    DEBUG_ASSERT(cpu_num != 0 && cpu_num < SMP_MAX_CPUS);

    /* Shouldn't be initialized yet */
@@ -1065,8 +1089,13 @@ thread_t* thread_create_idle_thread(uint cpu_num) {
    if (t == NULL) {
        return t;
    }
-    t->flags |= THREAD_FLAG_IDLE;
-    thread_set_pinned_cpu(t, cpu_num);
+    t->flags |= THREAD_FLAG_IDLE | THREAD_FLAG_DETACHED;
+    t->cpu_affinity = cpu_num_to_mask(cpu_num);
+
+    THREAD_LOCK(state);
+    sched_unblock_idle(t);
+    THREAD_UNLOCK(state);
+
    return t;
 }

@@ -1123,9 +1152,9 @@ void dump_thread(thread_t* t, bool full_dump) {

    if (full_dump) {
        dprintf(INFO, "dump_thread: t %p (%s:%s)\n", t, oname, t->name);
-        dprintf(INFO, "\tstate %s, last_cpu %u, pinned_cpu %d, priority %d:%d, "
+        dprintf(INFO, "\tstate %s, curr/last cpu %d/%d, cpu_affinity %#x, priority %d:%d, "
                      "remaining time slice %" PRIu64 "\n",
-                thread_state_to_str(t->state), t->last_cpu, t->pinned_cpu, t->base_priority,
+                thread_state_to_str(t->state), (int)t->curr_cpu, (int)t->last_cpu, t->cpu_affinity, t->base_priority,
                t->priority_boost, t->remaining_time_slice);
        dprintf(INFO, "\truntime_ns %" PRIu64 ", runtime_s %" PRIu64 "\n",
                runtime, runtime / 1000000000);
@@ -1209,9 +1238,10 @@ static enum handler_return wait_queue_timeout_handler(timer_t* timer, lk_time_t
    if (timer_trylock_or_cancel(timer, &thread_lock))
        return INT_NO_RESCHEDULE;

+    bool local_resched;
    enum handler_return ret = INT_NO_RESCHEDULE;
-    if (thread_unblock_from_wait_queue(thread, MX_ERR_TIMED_OUT) >= MX_OK) {
-        ret = INT_RESCHEDULE;
+    if (thread_unblock_from_wait_queue(thread, MX_ERR_TIMED_OUT, &local_resched) >= MX_OK) {
+        ret = local_resched ? INT_RESCHEDULE : INT_NO_RESCHEDULE;
    }

    spin_unlock(&thread_lock);
@@ -1309,8 +1339,8 @@ int wait_queue_wake_one(wait_queue_t* wait, bool reschedule, status_t wait_queue
        t->blocked_status = wait_queue_error;
        t->blocking_wait_queue = NULL;

-        sched_unblock(t);
-        if (reschedule)
+        bool local_resched = sched_unblock(t);
+        if (reschedule && local_resched)
            sched_reschedule();

        ret = 1;
@@ -1380,8 +1410,8 @@ int wait_queue_wake_all(wait_queue_t* wait, bool reschedule, status_t wait_queue
    DEBUG_ASSERT(ret > 0);
    DEBUG_ASSERT(wait->count == 0);

-    sched_unblock_list(&list);
-    if (reschedule)
+    bool local_resched = sched_unblock_list(&list);
+    if (reschedule && local_resched)
        sched_reschedule();

    return ret;
@@ -1421,12 +1451,12 @@ void wait_queue_destroy(wait_queue_t* wait) {
 * puts it at the head of the run queue.
 *
 * @param t  The thread to wake
- * @param wait_queue_error  The return value which the new thread will receive
- *   from wait_queue_block().
+ * @param wait_queue_error  The return value which the new thread will receive from wait_queue_block().
+ * @param local_resched  Returns if the caller should reschedule locally.
 *
 * @return MX_ERR_BAD_STATE if thread was not in any wait queue.
 */
-status_t thread_unblock_from_wait_queue(thread_t* t, status_t wait_queue_error) {
+static status_t thread_unblock_from_wait_queue(thread_t* t, status_t wait_queue_error, bool* local_resched) {
    DEBUG_ASSERT(t->magic == THREAD_MAGIC);
    DEBUG_ASSERT(arch_ints_disabled());
    DEBUG_ASSERT(spin_lock_held(&thread_lock));
@@ -1443,7 +1473,7 @@ status_t thread_unblock_from_wait_queue(thread_t* t, status_t wait_queue_error)
    t->blocking_wait_queue = NULL;
    t->blocked_status = wait_queue_error;

-    sched_unblock(t);
+    *local_resched = sched_unblock(t);

    return MX_OK;
 }
@@ -129,6 +129,8 @@ mx_status_t dlog_write(uint32_t flags, const void* ptr, size_t len) {
    }
    log->head += wiresize;

+    spin_unlock_irqrestore(&log->lock, state);
+
    // if we happen to be called from within the global thread lock, use a
    // special version of event signal
    if (spin_lock_holder_cpu(&thread_lock) == arch_curr_cpu_num()) {
@@ -137,8 +139,6 @@ mx_status_t dlog_write(uint32_t flags, const void* ptr, size_t len) {
        event_signal(&log->event, false);
    }

-    spin_unlock_irqrestore(&log->lock, state);
-
    return MX_OK;
 }

@@ -304,7 +304,7 @@ void platform_halt_cpu(void) {
    status_t result;
    park_cpu park = (park_cpu)KERNEL_SPIN_OFFSET;
    thread_t *self = get_current_thread();
-    const uint cpuid = thread_last_cpu(self);
+    const cpu_num_t cpuid = self->last_cpu;

    fbl::AutoLock lock(&cpu_halt_lock);
    // If we're the first CPU to halt then we need to create an address space to
@@ -42,7 +42,5 @@ MODULE_DEPS += \
 KERNEL_DEFINES += \
    PLATFORM_SUPPORTS_PANIC_SHELL=1

-SMP_MAX_CPUS ?= 8
-
 include make/module.mk

@@ -77,7 +77,7 @@ void lk_main(void)
    // create a thread to complete system initialization
    dprintf(SPEW, "creating bootstrap completion thread\n");
    thread_t *t = thread_create("bootstrap2", &bootstrap2, NULL, DEFAULT_PRIORITY, DEFAULT_STACK_SIZE);
-    thread_set_pinned_cpu(t, 0);
+    thread_set_cpu_affinity(t, cpu_num_to_mask(0));
    thread_detach(t);
    thread_resume(t);

@@ -142,7 +142,6 @@ void lk_init_secondary_cpus(uint secondary_cpu_count)
            secondary_idle_thread_count = i;
            break;
        }
-        thread_detach_and_resume(t);
    }
    secondary_idle_thread_count = secondary_cpu_count;
 }
Autor	SHA1	Mensagem	Data
Travis Geiselbrecht	8143c0420b	WIP moving cpu affinity to a mask Change-Id: I5dfc9f8e1c23d63ed749c84bc3dfa4118470b8e6	2017-09-07 14:33:09 -07:00
Travis Geiselbrecht	9a441b7ada	[kernel][mp] add new header with types and routines to deal with cpu numbers Add a few more types and switch some apis to using those. No functional change. Change-Id: I67add1247cf36d9e6a55f15dd809ffe4bafe06fd	2017-09-07 14:33:08 -07:00
Travis Geiselbrecht	c784a87fd8	WIP [kernel][sched] per cpu run queues Change-Id: Ia3f30995b3c258d4278e34e28553a548a2be0ae3	2017-09-07 14:30:48 -07:00
Travis Geiselbrecht	2be63ed7ba	[kernel][pc] bump SMP_MAX_CPUS on the PC platform to 16 Was set to 8 before, but there are enough >8 machines around that it's worth at least trying to run without artificially clamping the number of cpus. Change-Id: I027ed0a997d34d6e68c899a9531154ea67ad07ca	2017-09-07 14:30:48 -07:00