From 0a08f5135945ba1ccd1c4171428cba1a311036f4 Mon Sep 17 00:00:00 2001
From: Axel Heider
Date: Fri, 28 Jun 2024 15:41:33 +0200
Subject: [PATCH] boot/smp: use local mask to detect core start

Use a local mask variable to detect secondary core start instead of
incrementing ksNumCPUs. This gives the compiler full control over the
concurrent accesses; ksNumCPUs is just set at the end, when all cores
are up.

Signed-off-by: Axel Heider
---
 src/arch/arm/kernel/boot.c   | 93 +++++++++++++++++++++++++++++-------
 src/arch/riscv/kernel/boot.c | 78 ++++++++++++++++++++++++++----
 2 files changed, 145 insertions(+), 26 deletions(-)

diff --git a/src/arch/arm/kernel/boot.c b/src/arch/arm/kernel/boot.c
index 9974323be8..a3f6d68c81 100644
--- a/src/arch/arm/kernel/boot.c
+++ b/src/arch/arm/kernel/boot.c
@@ -35,6 +35,7 @@
  * then set it to 1.
  */
 BOOT_BSS static int node_boot_lock;
+BOOT_BSS static word_t node_boot_mask;
 #endif /* ENABLE_SMP_SUPPORT */
 
 BOOT_BSS static region_t reserved[NUM_RESERVED_REGIONS];
@@ -261,14 +262,28 @@ BOOT_CODE static void init_plat(void)
 }
 
 #ifdef ENABLE_SMP_SUPPORT
-BOOT_CODE static bool_t try_init_kernel_secondary_core(void)
+
+BOOT_CODE static void update_smp_lock(word_t new_val)
 {
-    /* Busy wait for primary node to release secondary nodes. Using C11 atomics
-     * guarantees proper visibility across threads and cores.
-     */
-    while (0 == __atomic_load_n(&node_boot_lock, __ATOMIC_ACQUIRE)) {
-        /* keep spinning */
+    assert(node_boot_lock < new_val);
+    __atomic_store_n(&node_boot_lock, new_val, __ATOMIC_RELEASE);
+}
+
+BOOT_CODE static void wait_for_smp_lock_update(word_t new_val)
+{
+    for (;;) {
+        word_t v = __atomic_load_n(&node_boot_lock, __ATOMIC_ACQUIRE);
+        if (v == new_val) {
+            return;
+        }
+        assert(v < new_val);
     }
+}
+
+BOOT_CODE static bool_t try_init_kernel_secondary_core(word_t core_id)
+{
+    /* Busy wait for primary node to release secondary nodes. */
+    wait_for_smp_lock_update(1);
 
     /* Perform cpu init */
     init_cpu();
@@ -284,12 +299,21 @@ BOOT_CODE static bool_t try_init_kernel_secondary_core(void)
     setIRQState(IRQReserved, CORE_IRQ_TO_IRQT(getCurrentCPUIndex(), INTERRUPT_VGIC_MAINTENANCE));
     setIRQState(IRQReserved, CORE_IRQ_TO_IRQT(getCurrentCPUIndex(), INTERRUPT_VTIMER_EVENT));
 #endif /* CONFIG_ARM_HYPERVISOR_SUPPORT */
-    NODE_LOCK_SYS;
 
+    /* initialize kernel on secondary nodes */
+    NODE_LOCK_SYS;
+    printf("core #%d init\n", (int)core_id);
     clock_sync_test();
-    (void)__atomic_fetch_add(&ksNumCPUs, 1, __ATOMIC_ACQ_REL);
     init_core_state(SchedulerAction_ResumeCurrentThread);
+    NODE_UNLOCK;
 
+    /* Set BIT(core_id - 1) in node_boot_mask to tell the primary node that
+     * init on this node is done.
+     */
+    (void)__atomic_fetch_or(&node_boot_mask, BIT(core_id - 1), __ATOMIC_ACQ_REL);
+
+    /* Busy wait (again) for the primary node to ack SMP init and release the
+     * secondary nodes. */
+    wait_for_smp_lock_update(2);
     return true;
 }
 
@@ -299,8 +323,7 @@ BOOT_CODE static void release_secondary_cpus(void)
     /* Release all nodes at the same time. Update the lock in a way that ensures
      * the result is visible everywhere.
      */
-    assert(0 == node_boot_lock); /* Sanity check for a proper lock state. */
-    __atomic_store_n(&node_boot_lock, 1, __ATOMIC_RELEASE);
+    update_smp_lock(1);
 
     /*
      * At this point in time the primary core (executing this code) already uses
@@ -319,8 +342,14 @@
     plat_cleanInvalidateL2Cache();
 #endif
 
-    /* Wait until all the secondary cores are done initialising */
-    while (CONFIG_MAX_NUM_NODES != __atomic_load_n(&ksNumCPUs, __ATOMIC_ACQUIRE)) {
+    /* Wait until all the secondary cores are done initialising, i.e. until
+     * each core has set its bit in the mask. Missing bits in the mask thus
+     * indicate which cores have not come up yet.
+     */
+    word_t missing_nodes = BIT(CONFIG_MAX_NUM_NODES - 1) - 1;
+    word_t ready_order[CONFIG_MAX_NUM_NODES - 1][2] = {0};
+    int idx = 0;
+    do {
 #ifdef ENABLE_SMP_CLOCK_SYNC_TEST_ON_BOOT
         /* Secondary cores compare their time with our primary core's time, so
          * keep updating the timestamp while spinning.
@@ -328,8 +357,31 @@
         NODE_STATE(ksCurTime) = getCurrentTime();
         __atomic_thread_fence(__ATOMIC_RELEASE); /* ensure write propagates */
 #endif
+        word_t mask = __atomic_load_n(&node_boot_mask, __ATOMIC_ACQUIRE);
+        /* Record the newly started cores, highest bit first. */
+        word_t new_cores_ready = missing_nodes & mask;
+        while (new_cores_ready > 0) {
+            unsigned int core_bit = wordBits - 1 - clzl(new_cores_ready);
+            new_cores_ready &= ~BIT(core_bit);
+            ready_order[idx][0] = timestamp();
+            ready_order[idx][1] = core_bit;
+            idx++;
+        }
+        missing_nodes &= ~mask;
+    } while (missing_nodes != 0);
+
+    for (unsigned int i = 0; i < ARRAY_SIZE(ready_order); i++) {
+        printf("[%"SEL4_PRIu_word"] core #%d up\n",
+               ready_order[i][0], (int)ready_order[i][1] + 1);
+    }
 }
+
+BOOT_CODE static void release_secondary_cores_to_userland(void)
+{
+    update_smp_lock(2);
+}
+
 #endif /* ENABLE_SMP_SUPPORT */
 
 /* Main kernel initialisation function. */
@@ -622,16 +674,20 @@ static BOOT_CODE bool_t try_init_kernel(
     SMP_COND_STATEMENT(clh_lock_init());
     SMP_COND_STATEMENT(release_secondary_cpus());
 
-    /* All cores are up now, so there can be concurrency. The kernel booting is
-     * supposed to be finished before the secondary cores are released, all the
-     * primary has to do now is schedule the initial thread. Currently there is
-     * nothing that touches any global data structures, nevertheless we grab the
-     * BKL here to play safe. It is released when the kernel is left.
+    /* All cores have finished booting now and are waiting to be released again
+     * so they can exit to userland. Kernel booting is supposed to be finished
+     * before the secondary cores are released, so all the primary has to do
+     * now is schedule the initial thread. Currently there is nothing here that
+     * touches any global data structures; nevertheless we grab the BKL to
+     * play safe.
      */
     NODE_LOCK_SYS;
-
+    ksNumCPUs = CONFIG_MAX_NUM_NODES;
     printf("Booting all finished, dropped to user space\n");
+    NODE_UNLOCK;
+
+    SMP_COND_STATEMENT(release_secondary_cores_to_userland());
 
-    /* kernel successfully initialized */
     return true;
 }
 
@@ -648,14 +704,15 @@ BOOT_CODE VISIBLE void init_kernel(
 
 #ifdef ENABLE_SMP_SUPPORT
     /* we assume there exists a cpu with id 0 and will use it for bootstrapping */
-    if (getCurrentCPUIndex() == 0) {
+    word_t core_id = getCurrentCPUIndex();
+    if (core_id == 0) {
         result = try_init_kernel(ui_p_reg_start,
                                  ui_p_reg_end,
                                  pv_offset,
                                  v_entry,
                                  dtb_addr_p, dtb_size);
     } else {
-        result = try_init_kernel_secondary_core();
+        result = try_init_kernel_secondary_core(core_id);
     }
 
 #else
diff --git a/src/arch/riscv/kernel/boot.c b/src/arch/riscv/kernel/boot.c
index 3144fb4c62..852bbafdcc 100644
--- a/src/arch/riscv/kernel/boot.c
+++ b/src/arch/riscv/kernel/boot.c
@@ -28,6 +28,7 @@
  * then set it to 1.
  */
 BOOT_BSS static int node_boot_lock;
+BOOT_BSS static word_t node_boot_mask;
 #endif /* ENABLE_SMP_SUPPORT */
 
 BOOT_BSS static region_t res_reg[NUM_RESERVED_REGIONS];
@@ -151,24 +152,47 @@ BOOT_CODE static void init_plat(void)
 
 #ifdef ENABLE_SMP_SUPPORT
 
+BOOT_CODE static void wait_for_lock_update(word_t new_val)
+{
+    for (;;) {
+        word_t v = __atomic_load_n(&node_boot_lock, __ATOMIC_ACQUIRE);
+        if (v == new_val) {
+            return;
+        }
+        assert(v < new_val);
+    }
+}
+
 BOOT_CODE static bool_t try_init_kernel_secondary_core(word_t hart_id, word_t core_id)
 {
     /* Busy wait for primary node to release secondary nodes. Using C11 atomics
      * guarantees proper visibility across threads and cores.
      */
-    while (0 == __atomic_load_n(&node_boot_lock, __ATOMIC_ACQUIRE)) {
-        /* keep spinning */
-    }
+    wait_for_lock_update(1);
 
     fence_r_rw(); /* ToDo: is this still necessary? */
 
     init_cpu();
+
+    /* initialize kernel on secondary nodes */
     NODE_LOCK_SYS;
 
     clock_sync_test();
-    (void)__atomic_fetch_add(&ksNumCPUs, 1, __ATOMIC_ACQ_REL);
+    printf("core #%d (hart_id #%d) init\n", (int)core_id, (int)hart_id);
     init_core_state(SchedulerAction_ResumeCurrentThread);
+    NODE_UNLOCK;
 
+    /* Set BIT(core_id - 1) in node_boot_mask to tell the primary node that
+     * init on this node is done.
+     */
+    (void)__atomic_fetch_or(&node_boot_mask, BIT(core_id - 1), __ATOMIC_ACQ_REL);
+    ifence_local();
+
+    /* Busy wait (again) for the primary node to ack SMP init and release the
+     * secondary nodes. */
+    wait_for_lock_update(2);
+
     return true;
 }
 
@@ -195,8 +219,14 @@ BOOT_CODE static void release_secondary_cores(void)
      */
     fence_rw_rw();
 
-    /* Keep spinning until all secondary nodes are up */
-    while (CONFIG_MAX_NUM_NODES != __atomic_load_n(&ksNumCPUs, __ATOMIC_ACQUIRE)) {
+    /* Wait until all the secondary cores are done initialising, i.e. until
+     * each core has set its bit in the mask. Missing bits in the mask thus
+     * indicate which cores have not come up yet.
+     */
+    word_t missing_nodes = BIT(CONFIG_MAX_NUM_NODES - 1) - 1;
+    word_t ready_order[CONFIG_MAX_NUM_NODES - 1][2] = {0};
+    int idx = 0;
+    do {
 #ifdef ENABLE_SMP_CLOCK_SYNC_TEST_ON_BOOT
         /* Secondary cores compare their time with our primary core's time, so
          * keep updating the timestamp while spinning.
@@ -204,8 +234,34 @@ BOOT_CODE static void release_secondary_cores(void)
         NODE_STATE(ksCurTime) = getCurrentTime();
         __atomic_thread_fence(__ATOMIC_RELEASE); /* ensure write propagates */
 #endif
+
+        word_t mask = __atomic_load_n(&node_boot_mask, __ATOMIC_ACQUIRE);
+        /* Record the newly started cores, highest bit first. */
+        word_t new_cores_ready = missing_nodes & mask;
+        while (new_cores_ready > 0) {
+            unsigned int core_bit = wordBits - 1 - clzl(new_cores_ready);
+            new_cores_ready &= ~BIT(core_bit);
+            ready_order[idx][0] = riscv_read_cycle();
+            ready_order[idx][1] = core_bit;
+            idx++;
+        }
+        missing_nodes &= ~mask;
+    } while (missing_nodes != 0);
+
+    for (unsigned int i = 0; i < ARRAY_SIZE(ready_order); i++) {
+        printf("[%"SEL4_PRIu_word"] core #%d up\n",
+               ready_order[i][0], (int)ready_order[i][1] + 1);
+    }
 }
+
+BOOT_CODE static void release_secondary_cores_to_userland(void)
+{
+    assert(1 == node_boot_lock);
+    __atomic_store_n(&node_boot_lock, 2, __ATOMIC_RELEASE);
+}
+
 #endif /* ENABLE_SMP_SUPPORT */
 
 /* Main kernel initialisation function. */
@@ -463,14 +519,20 @@ static BOOT_CODE bool_t try_init_kernel(
     SMP_COND_STATEMENT(clh_lock_init());
     SMP_COND_STATEMENT(release_secondary_cores());
 
-    /* All cores are up now, so there can be concurrency. The kernel booting is
-     * supposed to be finished before the secondary cores are released, all the
-     * primary has to do now is schedule the initial thread. Currently there is
-     * nothing that touches any global data structures, nevertheless we grab the
-     * BKL here to play safe. It is released when the kernel is left.
+    /* All cores have finished booting now and are waiting to be released again
+     * so they can exit to userland. Kernel booting is supposed to be finished
+     * before the secondary cores are released, so all the primary has to do
+     * now is schedule the initial thread. Currently there is nothing here that
+     * touches any global data structures; nevertheless we grab the BKL to
+     * play safe.
      */
     NODE_LOCK_SYS;
-
+    ksNumCPUs = CONFIG_MAX_NUM_NODES;
     printf("Booting all finished, dropped to user space\n");
+    NODE_UNLOCK;
+
+    SMP_COND_STATEMENT(release_secondary_cores_to_userland());
+
     return true;
 }
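
Note for reviewers: the two-phase release protocol above can be exercised
outside the kernel. The following is a minimal, self-contained userspace
sketch of the same rendezvous, with pthreads standing in for secondary
cores; boot_lock, boot_mask and NUM_CORES are illustrative names for this
sketch, not kernel identifiers.

    /* Two-phase boot rendezvous: the primary bumps a versioned lock to
     * release the secondaries for init, each secondary reports in via a bit
     * in a shared mask, and the primary bumps the lock again to release
     * everyone to "userland". Build with: cc -std=c11 -pthread rendezvous.c
     */
    #include <assert.h>
    #include <pthread.h>
    #include <stdio.h>

    #define NUM_CORES 4UL /* primary + 3 secondaries */

    static int boot_lock;           /* 0: held, 1: kernel init, 2: userland */
    static unsigned long boot_mask; /* bit (core_id - 1): core init done */

    static void wait_for_lock_update(int new_val)
    {
        for (;;) {
            int v = __atomic_load_n(&boot_lock, __ATOMIC_ACQUIRE);
            if (v == new_val) {
                return;
            }
            assert(v < new_val);
        }
    }

    static void *secondary_core(void *arg)
    {
        unsigned long core_id = (unsigned long)arg;

        wait_for_lock_update(1); /* phase 1: released for per-core init */
        /* ... per-core init would run here ... */
        __atomic_fetch_or(&boot_mask, 1UL << (core_id - 1), __ATOMIC_ACQ_REL);
        wait_for_lock_update(2); /* phase 2: released to "userland" */
        printf("core #%lu released\n", core_id);
        return NULL;
    }

    int main(void)
    {
        pthread_t threads[NUM_CORES - 1];
        for (unsigned long i = 1; i < NUM_CORES; i++) {
            pthread_create(&threads[i - 1], NULL, secondary_core, (void *)i);
        }

        /* Release all secondaries for init. */
        __atomic_store_n(&boot_lock, 1, __ATOMIC_RELEASE);

        /* Wait until every secondary has set its bit; missing bits identify
         * cores that have not come up yet.
         */
        unsigned long missing = (1UL << (NUM_CORES - 1)) - 1;
        while (missing != 0) {
            missing &= ~__atomic_load_n(&boot_mask, __ATOMIC_ACQUIRE);
        }

        /* All cores reported in; release them to "userland". */
        __atomic_store_n(&boot_lock, 2, __ATOMIC_RELEASE);

        for (unsigned long i = 1; i < NUM_CORES; i++) {
            pthread_join(threads[i - 1], NULL);
        }
        return 0;
    }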
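
The inner scan in release_secondary_cores() pulls newly started cores out of
the mask highest bit first. A worked example of just that extraction,
assuming CONFIG_MAX_NUM_NODES = 4 (three secondaries, tracked in mask bits
0..2) and using the GCC/Clang __builtin_clzl() in place of the kernel's
clzl():

    #include <stdio.h>

    int main(void)
    {
        /* 4 nodes: secondaries are cores 1..3, tracked in bits 0..2. */
        unsigned long missing_nodes = (1UL << 3) - 1; /* 0b111 */
        unsigned long mask = 0x4;                     /* core 3 set bit 2 */

        unsigned long new_cores_ready = missing_nodes & mask; /* 0b100 */
        while (new_cores_ready > 0) {
            /* Highest set bit first, as the clzl() scan in the patch does. */
            unsigned int core_bit = (unsigned int)(8 * sizeof(unsigned long))
                                    - 1
                                    - (unsigned int)__builtin_clzl(new_cores_ready);
            new_cores_ready &= ~(1UL << core_bit);
            printf("core #%u newly up\n", core_bit + 1); /* core #3 */
        }
        missing_nodes &= ~mask;
        printf("still missing: %#lx\n", missing_nodes); /* 0x3: cores 1, 2 */
        return 0;
    }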