From 9232766e604888e9b8ccda403249a3eeb2c7776e Mon Sep 17 00:00:00 2001 From: Peter Woolery Date: Thu, 23 Apr 2026 13:06:38 -0700 Subject: [PATCH 01/17] feat(firmware): add NVS-backed event log ring buffer Persistent 32-slot ring buffer of tagged diagnostic events (boot, wifi up/down, http ok/fail, heartbeat miss, reboot). Used to diagnose field failures post-hoc via the heartbeat payload, without needing serial access. Native-native stub lets policy be unit-tested. --- firmware/lib/event_log/event_log.cpp | 109 ++++++++++++++++++ firmware/lib/event_log/event_log.h | 39 +++++++ .../test/test_event_log/test_event_log.cpp | 56 +++++++++ 3 files changed, 204 insertions(+) create mode 100644 firmware/lib/event_log/event_log.cpp create mode 100644 firmware/lib/event_log/event_log.h create mode 100644 firmware/test/test_event_log/test_event_log.cpp diff --git a/firmware/lib/event_log/event_log.cpp b/firmware/lib/event_log/event_log.cpp new file mode 100644 index 0000000..f5c19a8 --- /dev/null +++ b/firmware/lib/event_log/event_log.cpp @@ -0,0 +1,109 @@ +// firmware/lib/event_log/event_log.cpp +#include "event_log.h" +#include +#include + +#ifdef ARDUINO + #include + #include + #include + static Preferences s_prefs; + static const char* NVS_NS = "evlog"; + static const char* NVS_HEAD = "head"; // next write slot (0..31) + static const char* NVS_CNT = "cnt"; // total writes (for seq) +#else + // Native build: in-memory stub + #include + static uint8_t g_slots[32 * 32]; + static uint32_t g_head = 0; + static uint32_t g_cnt = 0; + extern "C" void event_log_test_reset() { + memset(g_slots, 0, sizeof(g_slots)); + g_head = 0; + g_cnt = 0; + } +#endif + +static const size_t SLOTS = 32; +static const size_t SLOT_SIZE = sizeof(EventLogEntry); + +uint16_t event_log_path_hash(const char* path) { + // fnv1a-16 (fold 32-bit fnv1a down to 16 bits) + uint32_t h = 0x811c9dc5u; + while (*path) { h ^= (uint8_t)*path++; h *= 0x01000193u; } + return (uint16_t)((h >> 16) ^ (h & 0xFFFF)); +} + +static void slot_write(size_t idx, const EventLogEntry& e) { +#ifdef ARDUINO + char key[8]; snprintf(key, sizeof(key), "s%u", (unsigned)idx); + s_prefs.putBytes(key, &e, SLOT_SIZE); +#else + memcpy(&g_slots[idx * SLOT_SIZE], &e, SLOT_SIZE); +#endif +} + +static bool slot_read(size_t idx, EventLogEntry& e) { +#ifdef ARDUINO + char key[8]; snprintf(key, sizeof(key), "s%u", (unsigned)idx); + size_t n = s_prefs.getBytes(key, &e, SLOT_SIZE); + return n == SLOT_SIZE; +#else + memcpy(&e, &g_slots[idx * SLOT_SIZE], SLOT_SIZE); + return true; +#endif +} + +void event_log_init() { +#ifdef ARDUINO + s_prefs.begin(NVS_NS, /*readOnly=*/false); +#else + // nothing +#endif +} + +void event_log_write(EventLogTag tag, uint16_t data0, uint16_t data1) { + EventLogEntry e = {}; +#ifdef ARDUINO + time_t now = time(nullptr); + e.ts_unix = (now > 1700000000) ? (uint32_t)now : 0; + e.uptime_s = (uint32_t)(millis() / 1000); + uint32_t head = s_prefs.getUInt(NVS_HEAD, 0); + uint32_t cnt = s_prefs.getUInt(NVS_CNT, 0); +#else + e.ts_unix = 0; + e.uptime_s = g_cnt; // stand-in for uptime in native tests + uint32_t head = g_head; + uint32_t cnt = g_cnt; +#endif + e.tag = (uint8_t)tag; + e.data0 = data0; + e.data1 = data1; + e.seq = (uint8_t)(cnt & 0xFF); + slot_write(head % SLOTS, e); +#ifdef ARDUINO + s_prefs.putUInt(NVS_HEAD, (head + 1) % SLOTS); + s_prefs.putUInt(NVS_CNT, cnt + 1); +#else + g_head = (head + 1) % SLOTS; + g_cnt = cnt + 1; +#endif +} + +size_t event_log_read_recent(EventLogEntry* out, size_t max_entries) { +#ifdef ARDUINO + uint32_t head = s_prefs.getUInt(NVS_HEAD, 0); + uint32_t cnt = s_prefs.getUInt(NVS_CNT, 0); +#else + uint32_t head = g_head; + uint32_t cnt = g_cnt; +#endif + size_t available = (cnt < SLOTS) ? (size_t)cnt : SLOTS; + size_t n = (max_entries < available) ? max_entries : available; + for (size_t i = 0; i < n; i++) { + // newest is at (head - 1), then (head - 2), ... modulo SLOTS + size_t idx = (head + SLOTS - 1 - i) % SLOTS; + slot_read(idx, out[i]); + } + return n; +} diff --git a/firmware/lib/event_log/event_log.h b/firmware/lib/event_log/event_log.h new file mode 100644 index 0000000..de210ed --- /dev/null +++ b/firmware/lib/event_log/event_log.h @@ -0,0 +1,39 @@ +// firmware/lib/event_log/event_log.h +#pragma once +#include +#include + +enum EventLogTag : uint8_t { + EVT_BOOT = 1, // data0 = esp_reset_reason() value + EVT_WIFI_UP = 2, // data0 = rssi (signed, cast) + EVT_WIFI_DOWN = 3, // data0 = disconnect reason code + EVT_HTTP_OK = 4, // data0 = path hash (fnv1a16), data1 = elapsed_ms + EVT_HTTP_FAIL = 5, // data0 = path hash, data1 = (http_code or negative errno) + EVT_HEARTBEAT_MISS = 6, // data0 = consecutive miss count + EVT_NTP_SYNC = 7, // data0 = seconds since boot + EVT_REBOOT = 8, // data0 = reason enum (defined below) +}; + +enum RebootReason : uint8_t { + REBOOT_HEARTBEAT_MISS = 1, + REBOOT_FACTORY_RESET = 2, + REBOOT_OTA = 3, + REBOOT_WIFI_REPROV = 4, +}; + +struct EventLogEntry { + uint32_t ts_unix; // 0 if NTP not synced yet; fall back to millis/1000 + uint32_t uptime_s; // millis()/1000 at log time + uint16_t data0; + uint16_t data1; + uint8_t tag; // EventLogTag + uint8_t seq; // rolling sequence, wraps + uint8_t _pad[18]; // pad to 32 bytes for fixed slot size +} __attribute__((packed)); +static_assert(sizeof(EventLogEntry) == 32, "EventLogEntry must be 32 bytes"); + +// NVS-backed 32-slot ring buffer. Safe to call before NTP sync. +void event_log_init(); +void event_log_write(EventLogTag tag, uint16_t data0 = 0, uint16_t data1 = 0); +size_t event_log_read_recent(EventLogEntry* out, size_t max_entries); +uint16_t event_log_path_hash(const char* path); // fnv1a16 — exposed for tests diff --git a/firmware/test/test_event_log/test_event_log.cpp b/firmware/test/test_event_log/test_event_log.cpp new file mode 100644 index 0000000..1b2562f --- /dev/null +++ b/firmware/test/test_event_log/test_event_log.cpp @@ -0,0 +1,56 @@ +// firmware/test/test_native/test_event_log.cpp +#include +#include +#include "event_log.h" + +// --- Native NVS stub (declared in event_log.cpp for native builds) --- +extern "C" void event_log_test_reset(); + +void setUp() { event_log_test_reset(); } +void tearDown() {} + +void test_entry_is_32_bytes() { + TEST_ASSERT_EQUAL(32, sizeof(EventLogEntry)); +} + +void test_path_hash_is_stable_and_differs() { + uint16_t a = event_log_path_hash("/api/v1/heartbeat"); + uint16_t b = event_log_path_hash("/api/v1/heartbeat"); + uint16_t c = event_log_path_hash("/api/v1/camera/events/batch"); + TEST_ASSERT_EQUAL(a, b); + TEST_ASSERT_NOT_EQUAL(a, c); +} + +void test_write_then_read_recent_returns_newest_first() { + event_log_init(); + event_log_write(EVT_BOOT, 1, 0); + event_log_write(EVT_WIFI_UP, 2, 0); + event_log_write(EVT_HTTP_FAIL, 3, 500); + EventLogEntry buf[8]; + size_t n = event_log_read_recent(buf, 8); + TEST_ASSERT_EQUAL(3, n); + TEST_ASSERT_EQUAL(EVT_HTTP_FAIL, buf[0].tag); + TEST_ASSERT_EQUAL(500, buf[0].data1); + TEST_ASSERT_EQUAL(EVT_WIFI_UP, buf[1].tag); + TEST_ASSERT_EQUAL(EVT_BOOT, buf[2].tag); +} + +void test_ring_buffer_wraps_after_32_entries() { + event_log_init(); + for (int i = 0; i < 40; i++) event_log_write(EVT_HTTP_OK, (uint16_t)i, 0); + EventLogEntry buf[32]; + size_t n = event_log_read_recent(buf, 32); + TEST_ASSERT_EQUAL(32, n); + // Newest first: data0 should be 39, 38, 37, ... down to 8 + TEST_ASSERT_EQUAL(39, buf[0].data0); + TEST_ASSERT_EQUAL(8, buf[31].data0); +} + +int main() { + UNITY_BEGIN(); + RUN_TEST(test_entry_is_32_bytes); + RUN_TEST(test_path_hash_is_stable_and_differs); + RUN_TEST(test_write_then_read_recent_returns_newest_first); + RUN_TEST(test_ring_buffer_wraps_after_32_entries); + return UNITY_END(); +} From 95f91d365612c902c6bb584977536637e082950c Mon Sep 17 00:00:00 2001 From: Peter Woolery Date: Thu, 23 Apr 2026 13:13:21 -0700 Subject: [PATCH 02/17] fix(firmware): event_log thread safety and NVS wear - Remove monotonic counter writes to NVS (stop burning flash on every event). Derive head and cnt by scanning slots on boot. - Widen seq to uint32 so slot scan works across multi-year lifetimes. - Add FreeRTOS mutex around write/read so WiFi event handlers can safely call event_log_write from another task. - Check Preferences.begin() return; disable logging if NVS unavailable. - Extract NTP_SYNC_THRESHOLD constant; drop misleading native uptime. - Add tests for empty read, max_entries truncation, real-path hash. --- firmware/lib/event_log/event_log.cpp | 86 +++++++++++++------ firmware/lib/event_log/event_log.h | 4 +- .../test/test_event_log/test_event_log.cpp | 31 +++++++ 3 files changed, 95 insertions(+), 26 deletions(-) diff --git a/firmware/lib/event_log/event_log.cpp b/firmware/lib/event_log/event_log.cpp index f5c19a8..3766935 100644 --- a/firmware/lib/event_log/event_log.cpp +++ b/firmware/lib/event_log/event_log.cpp @@ -7,10 +7,15 @@ #include #include #include + #include + #include static Preferences s_prefs; static const char* NVS_NS = "evlog"; - static const char* NVS_HEAD = "head"; // next write slot (0..31) - static const char* NVS_CNT = "cnt"; // total writes (for seq) + static bool s_ok = false; + static SemaphoreHandle_t s_mutex = nullptr; + static uint32_t g_head = 0; // next write slot (0..31), RAM-only + static uint32_t g_cnt = 0; // total writes since boot scan, RAM-only + static constexpr time_t NTP_SYNC_THRESHOLD = 1700000000; // 2023-11-14 #else // Native build: in-memory stub #include @@ -56,48 +61,78 @@ static bool slot_read(size_t idx, EventLogEntry& e) { void event_log_init() { #ifdef ARDUINO - s_prefs.begin(NVS_NS, /*readOnly=*/false); + if (s_mutex == nullptr) { + s_mutex = xSemaphoreCreateMutex(); + } + s_ok = s_prefs.begin(NVS_NS, /*readOnly=*/false); + if (!s_ok) { + Serial.println("[evlog] NVS begin failed"); + return; + } + // Scan all 32 slots; locate the one with the largest seq. + // Empty log: every slot tag == 0 (not a valid EventLogTag, which starts at 1). + uint32_t max_seq = 0; + int max_idx = -1; + bool any_valid = false; + for (size_t i = 0; i < SLOTS; i++) { + EventLogEntry e = {}; + if (!slot_read(i, e)) continue; + if (e.tag == 0) continue; + any_valid = true; + if (max_idx < 0 || e.seq >= max_seq) { + max_seq = e.seq; + max_idx = (int)i; + } + } + if (any_valid) { + g_head = (uint32_t)((max_idx + 1) % SLOTS); + g_cnt = max_seq + 1; + } else { + g_head = 0; + g_cnt = 0; + } #else // nothing #endif } void event_log_write(EventLogTag tag, uint16_t data0, uint16_t data1) { - EventLogEntry e = {}; #ifdef ARDUINO + if (!s_ok) return; + if (s_mutex) xSemaphoreTake(s_mutex, portMAX_DELAY); + EventLogEntry e = {}; time_t now = time(nullptr); - e.ts_unix = (now > 1700000000) ? (uint32_t)now : 0; + e.ts_unix = (now > NTP_SYNC_THRESHOLD) ? (uint32_t)now : 0; e.uptime_s = (uint32_t)(millis() / 1000); - uint32_t head = s_prefs.getUInt(NVS_HEAD, 0); - uint32_t cnt = s_prefs.getUInt(NVS_CNT, 0); -#else - e.ts_unix = 0; - e.uptime_s = g_cnt; // stand-in for uptime in native tests - uint32_t head = g_head; - uint32_t cnt = g_cnt; -#endif e.tag = (uint8_t)tag; e.data0 = data0; e.data1 = data1; - e.seq = (uint8_t)(cnt & 0xFF); - slot_write(head % SLOTS, e); -#ifdef ARDUINO - s_prefs.putUInt(NVS_HEAD, (head + 1) % SLOTS); - s_prefs.putUInt(NVS_CNT, cnt + 1); + e.seq = g_cnt; + slot_write(g_head % SLOTS, e); + g_head = (g_head + 1) % SLOTS; + g_cnt = g_cnt + 1; + if (s_mutex) xSemaphoreGive(s_mutex); #else - g_head = (head + 1) % SLOTS; - g_cnt = cnt + 1; + EventLogEntry e = {}; + e.ts_unix = 0; + e.uptime_s = 0; + e.tag = (uint8_t)tag; + e.data0 = data0; + e.data1 = data1; + e.seq = g_cnt; + slot_write(g_head % SLOTS, e); + g_head = (g_head + 1) % SLOTS; + g_cnt = g_cnt + 1; #endif } size_t event_log_read_recent(EventLogEntry* out, size_t max_entries) { #ifdef ARDUINO - uint32_t head = s_prefs.getUInt(NVS_HEAD, 0); - uint32_t cnt = s_prefs.getUInt(NVS_CNT, 0); -#else + if (!s_ok) return 0; + if (s_mutex) xSemaphoreTake(s_mutex, portMAX_DELAY); +#endif uint32_t head = g_head; uint32_t cnt = g_cnt; -#endif size_t available = (cnt < SLOTS) ? (size_t)cnt : SLOTS; size_t n = (max_entries < available) ? max_entries : available; for (size_t i = 0; i < n; i++) { @@ -105,5 +140,8 @@ size_t event_log_read_recent(EventLogEntry* out, size_t max_entries) { size_t idx = (head + SLOTS - 1 - i) % SLOTS; slot_read(idx, out[i]); } +#ifdef ARDUINO + if (s_mutex) xSemaphoreGive(s_mutex); +#endif return n; } diff --git a/firmware/lib/event_log/event_log.h b/firmware/lib/event_log/event_log.h index de210ed..d4ff60f 100644 --- a/firmware/lib/event_log/event_log.h +++ b/firmware/lib/event_log/event_log.h @@ -27,8 +27,8 @@ struct EventLogEntry { uint16_t data0; uint16_t data1; uint8_t tag; // EventLogTag - uint8_t seq; // rolling sequence, wraps - uint8_t _pad[18]; // pad to 32 bytes for fixed slot size + uint32_t seq; // widened; survives multi-year event rates + uint8_t _pad[15]; // pad to 32 bytes for fixed slot size } __attribute__((packed)); static_assert(sizeof(EventLogEntry) == 32, "EventLogEntry must be 32 bytes"); diff --git a/firmware/test/test_event_log/test_event_log.cpp b/firmware/test/test_event_log/test_event_log.cpp index 1b2562f..aceb47f 100644 --- a/firmware/test/test_event_log/test_event_log.cpp +++ b/firmware/test/test_event_log/test_event_log.cpp @@ -46,11 +46,42 @@ void test_ring_buffer_wraps_after_32_entries() { TEST_ASSERT_EQUAL(8, buf[31].data0); } +void test_empty_log_read_returns_zero() { + event_log_init(); + EventLogEntry buf[8]; + size_t n = event_log_read_recent(buf, 8); + TEST_ASSERT_EQUAL(0, n); +} + +void test_read_recent_truncates_to_max_entries() { + event_log_init(); + for (int i = 0; i < 10; i++) event_log_write(EVT_HTTP_OK, (uint16_t)i, 0); + EventLogEntry buf[3]; + size_t n = event_log_read_recent(buf, 3); + TEST_ASSERT_EQUAL(3, n); + // Newest 3: data0 == 9, 8, 7 + TEST_ASSERT_EQUAL(9, buf[0].data0); + TEST_ASSERT_EQUAL(8, buf[1].data0); + TEST_ASSERT_EQUAL(7, buf[2].data0); +} + +void test_path_hash_distinguishes_real_api_paths() { + uint16_t h1 = event_log_path_hash("/api/v1/heartbeat"); + uint16_t h2 = event_log_path_hash("/api/v1/camera/events/batch"); + uint16_t h3 = event_log_path_hash("/api/v1/events/batch"); + TEST_ASSERT_NOT_EQUAL(h1, h2); + TEST_ASSERT_NOT_EQUAL(h1, h3); + TEST_ASSERT_NOT_EQUAL(h2, h3); +} + int main() { UNITY_BEGIN(); RUN_TEST(test_entry_is_32_bytes); RUN_TEST(test_path_hash_is_stable_and_differs); RUN_TEST(test_write_then_read_recent_returns_newest_first); RUN_TEST(test_ring_buffer_wraps_after_32_entries); + RUN_TEST(test_empty_log_read_returns_zero); + RUN_TEST(test_read_recent_truncates_to_max_entries); + RUN_TEST(test_path_hash_distinguishes_real_api_paths); return UNITY_END(); } From 9eb1e19651557963b2714668a2caf767ff9b6859 Mon Sep 17 00:00:00 2001 From: Peter Woolery Date: Thu, 23 Apr 2026 13:18:08 -0700 Subject: [PATCH 03/17] =?UTF-8?q?test(firmware):=20event=5Flog=20boot=20re?= =?UTF-8?q?covery=20=E2=80=94=20partial=20fill=20and=20post-wrap?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Exercises the slot-scan logic in event_log_init(): after a simulated reboot (RAM state cleared, NVS slots preserved) the module must resume with the correct head/cnt so newest-first read order is unchanged and subsequent writes continue the seq monotonically. Adds native-only event_log_test_simulate_reboot() helper. Lifts the slot-scan loop out of the #ifdef ARDUINO guard so the native stub exercises the same recovery path as production; the platform-specific NVS setup remains guarded. --- firmware/lib/event_log/event_log.cpp | 9 ++-- firmware/lib/event_log/event_log.h | 1 + .../test/test_event_log/test_event_log.cpp | 54 +++++++++++++++++++ 3 files changed, 61 insertions(+), 3 deletions(-) diff --git a/firmware/lib/event_log/event_log.cpp b/firmware/lib/event_log/event_log.cpp index 3766935..95fd630 100644 --- a/firmware/lib/event_log/event_log.cpp +++ b/firmware/lib/event_log/event_log.cpp @@ -27,6 +27,11 @@ g_head = 0; g_cnt = 0; } + extern "C" void event_log_test_simulate_reboot() { + // Simulate device reboot: clear in-RAM state, keep persistent slots. + g_head = 0; + g_cnt = 0; + } #endif static const size_t SLOTS = 32; @@ -69,6 +74,7 @@ void event_log_init() { Serial.println("[evlog] NVS begin failed"); return; } +#endif // Scan all 32 slots; locate the one with the largest seq. // Empty log: every slot tag == 0 (not a valid EventLogTag, which starts at 1). uint32_t max_seq = 0; @@ -91,9 +97,6 @@ void event_log_init() { g_head = 0; g_cnt = 0; } -#else - // nothing -#endif } void event_log_write(EventLogTag tag, uint16_t data0, uint16_t data1) { diff --git a/firmware/lib/event_log/event_log.h b/firmware/lib/event_log/event_log.h index d4ff60f..0782743 100644 --- a/firmware/lib/event_log/event_log.h +++ b/firmware/lib/event_log/event_log.h @@ -33,6 +33,7 @@ struct EventLogEntry { static_assert(sizeof(EventLogEntry) == 32, "EventLogEntry must be 32 bytes"); // NVS-backed 32-slot ring buffer. Safe to call before NTP sync. +// Call exactly once from application setup, before any task writes events. void event_log_init(); void event_log_write(EventLogTag tag, uint16_t data0 = 0, uint16_t data1 = 0); size_t event_log_read_recent(EventLogEntry* out, size_t max_entries); diff --git a/firmware/test/test_event_log/test_event_log.cpp b/firmware/test/test_event_log/test_event_log.cpp index aceb47f..5742b2e 100644 --- a/firmware/test/test_event_log/test_event_log.cpp +++ b/firmware/test/test_event_log/test_event_log.cpp @@ -74,6 +74,58 @@ void test_path_hash_distinguishes_real_api_paths() { TEST_ASSERT_NOT_EQUAL(h2, h3); } +extern "C" void event_log_test_simulate_reboot(); + +void test_boot_recovery_after_partial_fill() { + // Phase 1: write 5 entries before "reboot" + event_log_init(); + for (uint16_t i = 0; i < 5; i++) event_log_write(EVT_HTTP_OK, i, 0); + + // Phase 2: simulate reboot (clear RAM state, keep slots), re-init, verify + event_log_test_simulate_reboot(); + event_log_init(); + + // All 5 original entries should still be readable, newest first + EventLogEntry buf[8]; + size_t n = event_log_read_recent(buf, 8); + TEST_ASSERT_EQUAL(5, n); + TEST_ASSERT_EQUAL(4, buf[0].data0); // newest + TEST_ASSERT_EQUAL(0, buf[4].data0); // oldest + + // Phase 3: write one more — seq must continue (not restart at 0), + // so the new entry is the newest and slot index 5 holds it + event_log_write(EVT_HTTP_OK, 99, 0); + n = event_log_read_recent(buf, 8); + TEST_ASSERT_EQUAL(6, n); + TEST_ASSERT_EQUAL(99, buf[0].data0); + TEST_ASSERT_EQUAL(4, buf[1].data0); +} + +void test_boot_recovery_after_wrap() { + // Phase 1: write 40 entries (wraps the 32-slot ring once; oldest 8 dropped) + event_log_init(); + for (uint16_t i = 0; i < 40; i++) event_log_write(EVT_HTTP_OK, i, 0); + + // Phase 2: simulate reboot, re-init + event_log_test_simulate_reboot(); + event_log_init(); + + // Still 32 entries visible, newest=39, oldest=8 + EventLogEntry buf[32]; + size_t n = event_log_read_recent(buf, 32); + TEST_ASSERT_EQUAL(32, n); + TEST_ASSERT_EQUAL(39, buf[0].data0); + TEST_ASSERT_EQUAL(8, buf[31].data0); + + // Phase 3: one more write — newest becomes 100, head advances past + // wherever the max-seq slot was, oldest drops to data0=9 + event_log_write(EVT_HTTP_OK, 100, 0); + n = event_log_read_recent(buf, 32); + TEST_ASSERT_EQUAL(32, n); + TEST_ASSERT_EQUAL(100, buf[0].data0); + TEST_ASSERT_EQUAL(9, buf[31].data0); +} + int main() { UNITY_BEGIN(); RUN_TEST(test_entry_is_32_bytes); @@ -83,5 +135,7 @@ int main() { RUN_TEST(test_empty_log_read_returns_zero); RUN_TEST(test_read_recent_truncates_to_max_entries); RUN_TEST(test_path_hash_distinguishes_real_api_paths); + RUN_TEST(test_boot_recovery_after_partial_fill); + RUN_TEST(test_boot_recovery_after_wrap); return UNITY_END(); } From 95724bf3ff2178832140f9b3b216ee7d360ed721 Mon Sep 17 00:00:00 2001 From: Peter Woolery Date: Thu, 23 Apr 2026 13:21:23 -0700 Subject: [PATCH 04/17] feat(firmware): log boot and reboot reason to event log Every boot logs EVT_BOOT with esp_reset_reason(); every deliberate ESP.restart() is preceded by EVT_REBOOT with a reason code. This gives us a persistent answer to 'why did the device just reboot?'. --- firmware/src/main.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/firmware/src/main.cpp b/firmware/src/main.cpp index 91faca3..475bae5 100644 --- a/firmware/src/main.cpp +++ b/firmware/src/main.cpp @@ -8,6 +8,8 @@ #include "cv.h" #include "ble_scanner.h" #include "reporter.h" +#include "event_log.h" +#include // LED on GPIO2 (TimerCamera-F built-in LED) — verify against board schematic // Factory reset: hold GPIO37 (BOOT button) for 5 seconds @@ -45,6 +47,7 @@ static void check_factory_reset() { uint32_t held = millis(); while (digitalRead(BUTTON_PIN) == LOW) { if (millis() - held >= FACTORY_RESET_HOLD_MS) { + event_log_write(EVT_REBOOT, REBOOT_FACTORY_RESET, 0); config_clear_wifi(); ESP.restart(); } @@ -140,6 +143,9 @@ void setup() { pinMode(BUTTON_PIN, INPUT_PULLUP); led_set(true); // on = booting + event_log_init(); + event_log_write(EVT_BOOT, (uint16_t)esp_reset_reason(), 0); + if (!config_load(g_cfg)) { Serial.println("FATAL: device_id/location_id/hmac_secret not provisioned"); while (true) { delay(500); led_set(!digitalRead(LED_PIN)); } // fast blink @@ -148,6 +154,7 @@ void setup() { // Connect to WiFi if (!config_has_wifi()) { provisioning_run(); + event_log_write(EVT_REBOOT, REBOOT_WIFI_REPROV, 0); ESP.restart(); } @@ -161,6 +168,7 @@ void setup() { if (WiFi.status() != WL_CONNECTED) { // Saved creds failed — re-provision provisioning_run(); + event_log_write(EVT_REBOOT, REBOOT_WIFI_REPROV, 0); ESP.restart(); } @@ -183,7 +191,11 @@ void setup() { // OTA update support ArduinoOTA.setHostname(g_cfg.device_id.c_str()); ArduinoOTA.onStart([]() { ble_scanner_pause(); }); - ArduinoOTA.onEnd([]() { ble_scanner_resume(); ESP.restart(); }); + ArduinoOTA.onEnd([]() { + ble_scanner_resume(); + event_log_write(EVT_REBOOT, REBOOT_OTA, 0); + ESP.restart(); + }); ArduinoOTA.onError([](ota_error_t e) { ble_scanner_resume(); }); ArduinoOTA.begin(); From 9f293b46399b3688f56ec90fbabd27362a611c40 Mon Sep 17 00:00:00 2001 From: Peter Woolery Date: Thu, 23 Apr 2026 13:26:10 -0700 Subject: [PATCH 05/17] feat(firmware): event-driven WiFi reconnect with exponential backoff net_guard registers WiFi.onEvent() so disconnects are handled immediately instead of polled every 1s. Backoff 1s->2s->4s->...->60s cap. Every up/down transition is logged to the event log with the disconnect reason code, so field failures are diagnosable. --- firmware/lib/net_guard/net_guard.cpp | 55 +++++++++++++++++++ firmware/lib/net_guard/net_guard.h | 24 ++++++++ .../test/test_net_guard/test_net_guard.cpp | 32 +++++++++++ 3 files changed, 111 insertions(+) create mode 100644 firmware/lib/net_guard/net_guard.cpp create mode 100644 firmware/lib/net_guard/net_guard.h create mode 100644 firmware/test/test_net_guard/test_net_guard.cpp diff --git a/firmware/lib/net_guard/net_guard.cpp b/firmware/lib/net_guard/net_guard.cpp new file mode 100644 index 0000000..df59d8f --- /dev/null +++ b/firmware/lib/net_guard/net_guard.cpp @@ -0,0 +1,55 @@ +// firmware/lib/net_guard/net_guard.cpp +#include "net_guard.h" + +uint32_t net_guard_next_backoff_ms(uint32_t attempt) { + if (attempt >= 6) return 60000; + return 1000u * (1u << attempt); +} + +#ifdef ARDUINO +#include +#include "event_log.h" + +static const DeviceConfig* s_cfg = nullptr; +static volatile uint8_t s_last_disconnect = 0; +static volatile bool s_up = false; +static volatile uint32_t s_attempts = 0; +static volatile uint32_t s_next_retry_ms = 0; + +static void on_wifi_event(WiFiEvent_t event, WiFiEventInfo_t info) { + switch (event) { + case ARDUINO_EVENT_WIFI_STA_GOT_IP: + s_up = true; + s_attempts = 0; + s_next_retry_ms = 0; + event_log_write(EVT_WIFI_UP, (uint16_t)(int16_t)WiFi.RSSI(), 0); + break; + case ARDUINO_EVENT_WIFI_STA_DISCONNECTED: + s_up = false; + s_last_disconnect = (uint8_t)info.wifi_sta_disconnected.reason; + event_log_write(EVT_WIFI_DOWN, s_last_disconnect, 0); + s_next_retry_ms = millis() + net_guard_next_backoff_ms(s_attempts); + break; + default: break; + } +} + +void net_guard_start(const DeviceConfig& cfg) { + s_cfg = &cfg; + WiFi.onEvent(on_wifi_event); + WiFi.setAutoReconnect(false); // we drive reconnect ourselves +} + +bool net_guard_is_up() { return s_up; } + +uint8_t net_guard_last_disconnect_reason() { return s_last_disconnect; } + +extern "C" void net_guard_tick() { + if (s_up || s_cfg == nullptr) return; + if (millis() < s_next_retry_ms) return; + s_attempts++; + WiFi.disconnect(false, false); + WiFi.begin(s_cfg->wifi_ssid.c_str(), s_cfg->wifi_pass.c_str()); + s_next_retry_ms = millis() + net_guard_next_backoff_ms(s_attempts); +} +#endif diff --git a/firmware/lib/net_guard/net_guard.h b/firmware/lib/net_guard/net_guard.h new file mode 100644 index 0000000..915b483 --- /dev/null +++ b/firmware/lib/net_guard/net_guard.h @@ -0,0 +1,24 @@ +// firmware/lib/net_guard/net_guard.h +#pragma once +#include + +// Exponential backoff: 1s, 2s, 4s, 8s, 16s, 32s, 60s, 60s, ... +// attempt 0 -> 1000ms, clamped at 60000ms. +uint32_t net_guard_next_backoff_ms(uint32_t attempt); + +#ifdef ARDUINO +#include "config.h" + +// Registers WiFi.onEvent() handler and starts auto-reconnect loop. +// Must be called once after WiFi.begin() succeeds. +void net_guard_start(const DeviceConfig& cfg); + +// True iff WiFi is currently associated with IP. +bool net_guard_is_up(); + +// Last disconnect reason code from WIFI_EVENT_STA_DISCONNECTED (0 = none). +uint8_t net_guard_last_disconnect_reason(); + +// Non-blocking tick called from loop(); kicks reconnect if due. +extern "C" void net_guard_tick(); +#endif diff --git a/firmware/test/test_net_guard/test_net_guard.cpp b/firmware/test/test_net_guard/test_net_guard.cpp new file mode 100644 index 0000000..535ac07 --- /dev/null +++ b/firmware/test/test_net_guard/test_net_guard.cpp @@ -0,0 +1,32 @@ +// firmware/test/test_net_guard/test_net_guard.cpp +#include +#include "net_guard.h" + +void setUp() {} +void tearDown() {} + +void test_backoff_starts_at_one_second() { + TEST_ASSERT_EQUAL(1000, net_guard_next_backoff_ms(0)); +} + +void test_backoff_doubles_each_attempt() { + TEST_ASSERT_EQUAL(2000, net_guard_next_backoff_ms(1)); + TEST_ASSERT_EQUAL(4000, net_guard_next_backoff_ms(2)); + TEST_ASSERT_EQUAL(8000, net_guard_next_backoff_ms(3)); + TEST_ASSERT_EQUAL(16000, net_guard_next_backoff_ms(4)); + TEST_ASSERT_EQUAL(32000, net_guard_next_backoff_ms(5)); +} + +void test_backoff_clamps_at_60s() { + TEST_ASSERT_EQUAL(60000, net_guard_next_backoff_ms(6)); + TEST_ASSERT_EQUAL(60000, net_guard_next_backoff_ms(7)); + TEST_ASSERT_EQUAL(60000, net_guard_next_backoff_ms(100)); +} + +int main() { + UNITY_BEGIN(); + RUN_TEST(test_backoff_starts_at_one_second); + RUN_TEST(test_backoff_doubles_each_attempt); + RUN_TEST(test_backoff_clamps_at_60s); + return UNITY_END(); +} From 84d9ba349b17185ed29cb415f4cdaf022437c96b Mon Sep 17 00:00:00 2001 From: Peter Woolery Date: Thu, 23 Apr 2026 13:31:47 -0700 Subject: [PATCH 06/17] fix(firmware): net_guard boot-state seed + no spurious disconnect MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Seed s_up from WiFi.status() in net_guard_start so the first STA_GOT_IP (fired during setup's busy-wait, before onEvent was registered) is not missed — prevents a reconnect flap on every boot. - Drop WiFi.disconnect() from net_guard_tick; WiFi.begin() alone re-associates cleanly and avoids a spurious STA_DISCONNECTED that was double-logging EVT_WIFI_DOWN on every retry. - Re-check s_up after the millis() timing gate to close the GOT_IP-vs-tick race. - Document the volatile-only shared-state contract. --- firmware/lib/net_guard/net_guard.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/firmware/lib/net_guard/net_guard.cpp b/firmware/lib/net_guard/net_guard.cpp index df59d8f..f0e0cf1 100644 --- a/firmware/lib/net_guard/net_guard.cpp +++ b/firmware/lib/net_guard/net_guard.cpp @@ -10,6 +10,9 @@ uint32_t net_guard_next_backoff_ms(uint32_t attempt) { #include #include "event_log.h" +// Shared with the WiFi event task. 32-bit aligned loads/stores are atomic on +// Xtensa; volatile suffices. Tick re-evaluates every loop iteration, so stale +// reads self-correct within ~200ms. static const DeviceConfig* s_cfg = nullptr; static volatile uint8_t s_last_disconnect = 0; static volatile bool s_up = false; @@ -36,6 +39,10 @@ static void on_wifi_event(WiFiEvent_t event, WiFiEventInfo_t info) { void net_guard_start(const DeviceConfig& cfg) { s_cfg = &cfg; + // Seed s_up from the current WiFi state. setup()'s busy-wait on + // WiFi.begin() can produce a STA_GOT_IP before onEvent() is registered; + // without this seed, the first tick would force a spurious reconnect. + if (WiFi.status() == WL_CONNECTED) s_up = true; WiFi.onEvent(on_wifi_event); WiFi.setAutoReconnect(false); // we drive reconnect ourselves } @@ -47,8 +54,11 @@ uint8_t net_guard_last_disconnect_reason() { return s_last_disconnect; } extern "C" void net_guard_tick() { if (s_up || s_cfg == nullptr) return; if (millis() < s_next_retry_ms) return; + if (s_up) return; // re-check after the timing gate — closes GOT_IP-vs-tick race s_attempts++; - WiFi.disconnect(false, false); + // WiFi.begin() alone re-associates cleanly; a prior WiFi.disconnect() call + // synchronously emits STA_DISCONNECTED on the event task, which would + // double-log EVT_WIFI_DOWN (reason=ASSOC_LEAVE) on every retry. WiFi.begin(s_cfg->wifi_ssid.c_str(), s_cfg->wifi_pass.c_str()); s_next_retry_ms = millis() + net_guard_next_backoff_ms(s_attempts); } From cfa0d2563f25710a6efd60e0d968429711150267 Mon Sep 17 00:00:00 2001 From: Peter Woolery Date: Thu, 23 Apr 2026 13:31:54 -0700 Subject: [PATCH 07/17] fix(firmware): event_log bounded mutex wait, skip on contention Mutex take in event_log_write and event_log_read_recent switched from portMAX_DELAY to pdMS_TO_TICKS(50) with skip-on-timeout. Prevents the high-priority WiFi event task from stalling on NVS writes; diag loss under contention is preferable to dropped WiFi events. --- firmware/lib/event_log/event_log.cpp | 10 ++++++++-- firmware/lib/event_log/event_log.h | 6 ++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/firmware/lib/event_log/event_log.cpp b/firmware/lib/event_log/event_log.cpp index 95fd630..12beed3 100644 --- a/firmware/lib/event_log/event_log.cpp +++ b/firmware/lib/event_log/event_log.cpp @@ -102,7 +102,11 @@ void event_log_init() { void event_log_write(EventLogTag tag, uint16_t data0, uint16_t data1) { #ifdef ARDUINO if (!s_ok) return; - if (s_mutex) xSemaphoreTake(s_mutex, portMAX_DELAY); + // Bounded wait: skip on contention rather than stall the calling task. + // This matters because event_log_write runs from the WiFi event task + // (priority 23); blocking it on a 10-100ms NVS write can overflow the + // event queue. Diagnostic loss is preferable to dropped WiFi events. + if (s_mutex && xSemaphoreTake(s_mutex, pdMS_TO_TICKS(50)) != pdTRUE) return; EventLogEntry e = {}; time_t now = time(nullptr); e.ts_unix = (now > NTP_SYNC_THRESHOLD) ? (uint32_t)now : 0; @@ -132,7 +136,9 @@ void event_log_write(EventLogTag tag, uint16_t data0, uint16_t data1) { size_t event_log_read_recent(EventLogEntry* out, size_t max_entries) { #ifdef ARDUINO if (!s_ok) return 0; - if (s_mutex) xSemaphoreTake(s_mutex, portMAX_DELAY); + // Bounded wait to match event_log_write. Reads are slower (32 NVS gets), + // but returning 0 entries under contention beats blocking the caller. + if (s_mutex && xSemaphoreTake(s_mutex, pdMS_TO_TICKS(50)) != pdTRUE) return 0; #endif uint32_t head = g_head; uint32_t cnt = g_cnt; diff --git a/firmware/lib/event_log/event_log.h b/firmware/lib/event_log/event_log.h index 0782743..470f45a 100644 --- a/firmware/lib/event_log/event_log.h +++ b/firmware/lib/event_log/event_log.h @@ -35,6 +35,12 @@ static_assert(sizeof(EventLogEntry) == 32, "EventLogEntry must be 32 bytes"); // NVS-backed 32-slot ring buffer. Safe to call before NTP sync. // Call exactly once from application setup, before any task writes events. void event_log_init(); + +// Safe to call from any FreeRTOS task after event_log_init(). +// Bounded mutex wait (~50ms) — will silently skip on contention rather than +// block the calling task. Acceptable for diagnostic logging. void event_log_write(EventLogTag tag, uint16_t data0 = 0, uint16_t data1 = 0); + +// Same bounded-wait contract as event_log_write: returns 0 on mutex timeout. size_t event_log_read_recent(EventLogEntry* out, size_t max_entries); uint16_t event_log_path_hash(const char* path); // fnv1a16 — exposed for tests From af3067d4814d80b0265260a958b219a664d7461c Mon Sep 17 00:00:00 2001 From: Peter Woolery Date: Thu, 23 Apr 2026 13:36:29 -0700 Subject: [PATCH 08/17] refactor(firmware): drive WiFi reconnect from net_guard events loop() no longer blocks for 5s after a disconnect; reconnect is scheduled from the WiFi event handler with exponential backoff. Buffered reports flush on every clean UP transition. --- firmware/lib/net_guard/library.json | 6 ++++++ firmware/src/main.cpp | 19 ++++++++++--------- 2 files changed, 16 insertions(+), 9 deletions(-) create mode 100644 firmware/lib/net_guard/library.json diff --git a/firmware/lib/net_guard/library.json b/firmware/lib/net_guard/library.json new file mode 100644 index 0000000..fe445e3 --- /dev/null +++ b/firmware/lib/net_guard/library.json @@ -0,0 +1,6 @@ +{ + "name": "net_guard", + "build": { + "flags": ["-I$PROJECT_SRC_DIR"] + } +} diff --git a/firmware/src/main.cpp b/firmware/src/main.cpp index 475bae5..4cb595d 100644 --- a/firmware/src/main.cpp +++ b/firmware/src/main.cpp @@ -9,6 +9,7 @@ #include "ble_scanner.h" #include "reporter.h" #include "event_log.h" +#include "net_guard.h" #include // LED on GPIO2 (TimerCamera-F built-in LED) — verify against board schematic @@ -172,6 +173,7 @@ void setup() { ESP.restart(); } + net_guard_start(g_cfg); led_set(false); // off = connected // NTP sync (UTC) @@ -208,15 +210,14 @@ void setup() { void loop() { ArduinoOTA.handle(); check_factory_reset(); + net_guard_tick(); - if (WiFi.status() != WL_CONNECTED) { - led_set(true); // on = no WiFi - WiFi.reconnect(); - delay(5000); - if (WiFi.status() == WL_CONNECTED) { - led_set(false); - reporter_flush(g_cfg); - } + static bool s_was_up = true; + bool up = net_guard_is_up(); + if (up != s_was_up) { + led_set(!up); // LED on when NOT up + if (up) reporter_flush(g_cfg); + s_was_up = up; } - delay(1000); + delay(200); } From 57129ba0785c53ce7e88d839f6d60f4bab7ac9be Mon Sep 17 00:00:00 2001 From: Peter Woolery Date: Thu, 23 Apr 2026 13:41:53 -0700 Subject: [PATCH 09/17] fix(firmware): net_guard silent-wifi-death fallback + header hygiene - net_guard_tick now detects status-vs-event divergence. If s_up is true but WiFi.status() says otherwise (rare: driver wedge, silent RF failure), force DOWN state and schedule reconnect. Uses 0xFF disconnect reason so the event log distinguishes this path. - Forward-declare DeviceConfig in net_guard.h so consumers that don't call net_guard_start don't transitively pull config.h. --- firmware/lib/net_guard/net_guard.cpp | 10 ++++++++++ firmware/lib/net_guard/net_guard.h | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/firmware/lib/net_guard/net_guard.cpp b/firmware/lib/net_guard/net_guard.cpp index f0e0cf1..36b4f3e 100644 --- a/firmware/lib/net_guard/net_guard.cpp +++ b/firmware/lib/net_guard/net_guard.cpp @@ -7,6 +7,7 @@ uint32_t net_guard_next_backoff_ms(uint32_t attempt) { } #ifdef ARDUINO +#include "config.h" #include #include "event_log.h" @@ -52,6 +53,15 @@ bool net_guard_is_up() { return s_up; } uint8_t net_guard_last_disconnect_reason() { return s_last_disconnect; } extern "C" void net_guard_tick() { + // Watchdog against silent WiFi death: if we think we're up but the radio + // disagrees, force the DOWN state so reconnect scheduling kicks in. + if (s_up && WiFi.status() != WL_CONNECTED) { + s_up = false; + s_last_disconnect = 0xFF; // 0xFF = "silent death, no event" + event_log_write(EVT_WIFI_DOWN, s_last_disconnect, 0); + s_next_retry_ms = millis() + net_guard_next_backoff_ms(s_attempts); + } + if (s_up || s_cfg == nullptr) return; if (millis() < s_next_retry_ms) return; if (s_up) return; // re-check after the timing gate — closes GOT_IP-vs-tick race diff --git a/firmware/lib/net_guard/net_guard.h b/firmware/lib/net_guard/net_guard.h index 915b483..9b9fc66 100644 --- a/firmware/lib/net_guard/net_guard.h +++ b/firmware/lib/net_guard/net_guard.h @@ -7,7 +7,7 @@ uint32_t net_guard_next_backoff_ms(uint32_t attempt); #ifdef ARDUINO -#include "config.h" +struct DeviceConfig; // forward-decl; only net_guard_start needs the full type // Registers WiFi.onEvent() handler and starts auto-reconnect loop. // Must be called once after WiFi.begin() succeeds. From 8f8ad0b1b0da248067e2988946edfb1dbef45ed1 Mon Sep 17 00:00:00 2001 From: Peter Woolery Date: Thu, 23 Apr 2026 13:44:17 -0700 Subject: [PATCH 10/17] fix(firmware): add HTTP timeouts + 3-try retry, report heartbeat status Unbounded TLS/HTTP POSTs were blocking the reporter task indefinitely on weak WiFi. Now: 5s connect timeout, 10s response timeout, 3 attempts with 0/2s/5s backoff. Every attempt logs HTTP_OK or HTTP_FAIL to the event log. reporter_heartbeat now returns bool so the caller can count consecutive misses. --- firmware/src/reporter.cpp | 38 ++++++++++++++++++++++++++++++-------- firmware/src/reporter.h | 2 +- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/firmware/src/reporter.cpp b/firmware/src/reporter.cpp index a896e71..a980a13 100644 --- a/firmware/src/reporter.cpp +++ b/firmware/src/reporter.cpp @@ -1,6 +1,7 @@ // firmware/src/reporter.cpp #include "reporter.h" #include "hmac.h" +#include "event_log.h" #include #include #include @@ -21,25 +22,46 @@ static uint32_t now_ts() { return (uint32_t)time(nullptr); } -static bool post_json(const DeviceConfig& cfg, const char* path, const String& body) { +static bool post_json_once(const DeviceConfig& cfg, const char* path, const String& body) { uint32_t ts = now_ts(); - // Reject if NTP hasn't synced yet (timestamp would be near epoch 0) - if (ts < 1700000000UL) return false; // pre-2023 → clock not valid + if (ts < 1700000000UL) return false; String sig = hmac_sign(cfg.hmac_secret, "POST", path, ts, body); - if (sig.isEmpty()) return false; // HMAC failed + if (sig.isEmpty()) return false; HTTPClient http; String url = String(REPORTER_API_HOST) + path; http.begin(url); + http.setConnectTimeout(5000); // DNS + TCP connect + http.setTimeout(10000); // per-transaction response timeout http.addHeader("Content-Type", "application/json"); http.addHeader("X-Device-Id", cfg.device_id); http.addHeader("X-Timestamp", String(ts)); http.addHeader("X-Signature", sig); + uint32_t t0 = millis(); int code = http.POST(body); + uint32_t elapsed = millis() - t0; http.end(); - Serial.printf("[HTTP] POST %s → %d\n", url.c_str(), code); - return (code == 200); + uint16_t phash = event_log_path_hash(path); + Serial.printf("[HTTP] POST %s -> %d (%u ms)\n", url.c_str(), code, (unsigned)elapsed); + if (code == 200) { + event_log_write(EVT_HTTP_OK, phash, (uint16_t)((elapsed > 65535) ? 65535 : elapsed)); + return true; + } + event_log_write(EVT_HTTP_FAIL, phash, (uint16_t)code); + return false; +} + +static bool post_json(const DeviceConfig& cfg, const char* path, const String& body) { + // 3 attempts with 0/2s/5s delays. Total worst case ~ 2x(5+10)s + 7s = 37s; + // acceptable for an hourly task and within the 30s TWDT once Task 6 lands + // IF TWDT is fed between attempts (see Task 6). + static const uint16_t DELAYS_MS[] = { 0, 2000, 5000 }; + for (int i = 0; i < 3; i++) { + if (DELAYS_MS[i]) vTaskDelay(pdMS_TO_TICKS(DELAYS_MS[i])); + if (post_json_once(cfg, path, body)) return true; + } + return false; } static String build_camera_batch(const DeviceConfig& cfg, @@ -147,7 +169,7 @@ void reporter_submit_ble(const DeviceConfig& cfg, const BLEHourlyRecord& rec) { } } -void reporter_heartbeat(const DeviceConfig& cfg, uint32_t uptime_s, int wifi_rssi) { +bool reporter_heartbeat(const DeviceConfig& cfg, uint32_t uptime_s, int wifi_rssi) { JsonDocument doc; doc["device_id"] = cfg.device_id; doc["firmware_version"] = "1.0.0"; @@ -156,7 +178,7 @@ void reporter_heartbeat(const DeviceConfig& cfg, uint32_t uptime_s, int wifi_rss doc["pending_records"] = (int)(s_cam_buf.size() + s_ble_buf.size()); doc["uptime_seconds"] = uptime_s; String body; serializeJson(doc, body); - post_json(cfg, "/api/v1/heartbeat", body); + return post_json(cfg, "/api/v1/heartbeat", body); } void reporter_flush(const DeviceConfig& cfg) { diff --git a/firmware/src/reporter.h b/firmware/src/reporter.h index 96e796e..cd3f49e 100644 --- a/firmware/src/reporter.h +++ b/firmware/src/reporter.h @@ -17,5 +17,5 @@ static const char* REPORTER_API_HOST = "http://logs.research.bike"; void reporter_init(); void reporter_submit_camera(const DeviceConfig& cfg, const CameraHourlyRecord& rec); void reporter_submit_ble(const DeviceConfig& cfg, const BLEHourlyRecord& rec); -void reporter_heartbeat(const DeviceConfig& cfg, uint32_t uptime_s, int wifi_rssi); +bool reporter_heartbeat(const DeviceConfig& cfg, uint32_t uptime_s, int wifi_rssi); void reporter_flush(const DeviceConfig& cfg); From 7b546d0ed7ecd86fd16cdbc0af53fe54631be88f Mon Sep 17 00:00:00 2001 From: Peter Woolery Date: Thu, 23 Apr 2026 13:49:05 -0700 Subject: [PATCH 11/17] feat(firmware): enable task watchdog on camera/reporter/loop tasks 30s TWDT subscribes all three long-running tasks and panics on hang. The reporter task's retry loop explicitly feeds between attempts so the 3-try sequence (worst case 52s) does not itself trip the dog. Reset reason on next boot is visible via esp_reset_reason() which EVT_BOOT already logs. --- firmware/src/main.cpp | 12 ++++++++++++ firmware/src/reporter.cpp | 9 ++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/firmware/src/main.cpp b/firmware/src/main.cpp index 4cb595d..b3eea6c 100644 --- a/firmware/src/main.cpp +++ b/firmware/src/main.cpp @@ -11,6 +11,7 @@ #include "event_log.h" #include "net_guard.h" #include +#include // LED on GPIO2 (TimerCamera-F built-in LED) — verify against board schematic // Factory reset: hold GPIO37 (BOOT button) for 5 seconds @@ -53,6 +54,7 @@ static void check_factory_reset() { ESP.restart(); } delay(50); + esp_task_wdt_reset(); } } @@ -60,6 +62,7 @@ static void check_factory_reset() { static void task_camera(void*) { static uint8_t frame[CV_PIXELS]; // static: avoids 9KB on task stack int last_logged_track_id = 0; // diagnostic: log each new track once + esp_task_wdt_add(nullptr); while (true) { if (camera_capture_96(frame)) { if (xSemaphoreTake(s_cv_mutex, pdMS_TO_TICKS(100)) == pdTRUE) { @@ -86,15 +89,18 @@ static void task_camera(void*) { } } vTaskDelay(pdMS_TO_TICKS(CAM_INTERVAL_MS)); + esp_task_wdt_reset(); } } // Hourly reporter task — runs on core 0 static void task_reporter(void*) { uint32_t last_report_ts = 0; // 0 = not initialized yet + esp_task_wdt_add(nullptr); while (true) { vTaskDelay(pdMS_TO_TICKS(10000)); // check every 10s + esp_task_wdt_reset(); uint32_t now = (uint32_t)(time(nullptr)); if (now < 1700000000UL) continue; // NTP not synced @@ -203,11 +209,17 @@ void setup() { s_cv_mutex = xSemaphoreCreateMutex(); + // Task watchdog: 30s timeout, panic on trigger so we reboot and log + // via esp_reset_reason() in EVT_BOOT on the next boot. + esp_task_wdt_init(30, /*panic=*/true); + esp_task_wdt_add(nullptr); // subscribe the Arduino loopTask + xTaskCreatePinnedToCore(task_camera, "cam", 8192, nullptr, 2, nullptr, 1); xTaskCreatePinnedToCore(task_reporter, "rep", 8192, nullptr, 1, nullptr, 0); } void loop() { + esp_task_wdt_reset(); ArduinoOTA.handle(); check_factory_reset(); net_guard_tick(); diff --git a/firmware/src/reporter.cpp b/firmware/src/reporter.cpp index a980a13..8880f4c 100644 --- a/firmware/src/reporter.cpp +++ b/firmware/src/reporter.cpp @@ -8,6 +8,7 @@ #include #include #include +#include static std::vector s_cam_buf; static std::vector s_ble_buf; @@ -53,12 +54,14 @@ static bool post_json_once(const DeviceConfig& cfg, const char* path, const Stri } static bool post_json(const DeviceConfig& cfg, const char* path, const String& body) { - // 3 attempts with 0/2s/5s delays. Total worst case ~ 2x(5+10)s + 7s = 37s; - // acceptable for an hourly task and within the 30s TWDT once Task 6 lands - // IF TWDT is fed between attempts (see Task 6). + // 3 attempts. Worst case per call: 3 × (5s connect + 10s response) + 0 + 2 + 5 = 52s. + // TWDT is fed before the backoff delay and before each attempt so the 30s + // timeout doesn't fire mid-sequence. static const uint16_t DELAYS_MS[] = { 0, 2000, 5000 }; for (int i = 0; i < 3; i++) { + esp_task_wdt_reset(); if (DELAYS_MS[i]) vTaskDelay(pdMS_TO_TICKS(DELAYS_MS[i])); + esp_task_wdt_reset(); if (post_json_once(cfg, path, body)) return true; } return false; From f08f70a8fb27a6edd0fc8de9b452f11650eeb501 Mon Sep 17 00:00:00 2001 From: Peter Woolery Date: Thu, 23 Apr 2026 13:52:07 -0700 Subject: [PATCH 12/17] feat(firmware): software heartbeat-miss watchdog reboots after 6h offline Reporter task counts consecutive heartbeat failures from the bool returned by reporter_heartbeat (Task 5). After 6 consecutive misses (~6 hours at the hourly cadence) the device logs EVT_HEARTBEAT_MISS then EVT_REBOOT(REBOOT_HEARTBEAT_MISS) and restarts, giving the whole network stack a clean reinitialization. The 200ms delay before the restart lets NVS commit the REBOOT entry so the next boot can report it via EVT_BOOT + esp_reset_reason(). --- firmware/src/main.cpp | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/firmware/src/main.cpp b/firmware/src/main.cpp index b3eea6c..9767a87 100644 --- a/firmware/src/main.cpp +++ b/firmware/src/main.cpp @@ -137,10 +137,24 @@ static void task_reporter(void*) { reporter_submit_camera(g_cfg, cam_rec); reporter_submit_ble(g_cfg, ble_rec); - reporter_heartbeat(g_cfg, millis() / 1000, WiFi.RSSI()); + bool hb_ok = reporter_heartbeat(g_cfg, millis() / 1000, WiFi.RSSI()); ble_scanner_reinit(); led_set(false); + + static uint8_t consecutive_misses = 0; + if (hb_ok) { + consecutive_misses = 0; + } else { + consecutive_misses++; + event_log_write(EVT_HEARTBEAT_MISS, consecutive_misses, 0); + Serial.printf("[WDG] heartbeat miss %u/6\n", consecutive_misses); + if (consecutive_misses >= 6) { + event_log_write(EVT_REBOOT, REBOOT_HEARTBEAT_MISS, 0); + delay(200); // let NVS commit before reboot + ESP.restart(); + } + } } } From 5c9f5df0ceace3deb7a53af96083d7970e390d41 Mon Sep 17 00:00:00 2001 From: Peter Woolery Date: Thu, 23 Apr 2026 13:54:55 -0700 Subject: [PATCH 13/17] feat(firmware): include diagnostics in heartbeat payload Heartbeat v1.1.0 now carries heap stats (free + min_free since boot), esp_reset_reason(), last WiFi disconnect code, and the last 8 persisted event-log entries. Makes field failures diagnosable server-side without retrieving the device: the post-reboot heartbeat will include EVT_BOOT with reset reason and whatever EVT_WIFI_DOWN or EVT_HTTP_FAIL entries preceded it. --- firmware/src/reporter.cpp | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/firmware/src/reporter.cpp b/firmware/src/reporter.cpp index 8880f4c..b3ed72b 100644 --- a/firmware/src/reporter.cpp +++ b/firmware/src/reporter.cpp @@ -2,6 +2,7 @@ #include "reporter.h" #include "hmac.h" #include "event_log.h" +#include "net_guard.h" #include #include #include @@ -9,6 +10,8 @@ #include #include #include +#include +#include static std::vector s_cam_buf; static std::vector s_ble_buf; @@ -175,11 +178,31 @@ void reporter_submit_ble(const DeviceConfig& cfg, const BLEHourlyRecord& rec) { bool reporter_heartbeat(const DeviceConfig& cfg, uint32_t uptime_s, int wifi_rssi) { JsonDocument doc; doc["device_id"] = cfg.device_id; - doc["firmware_version"] = "1.0.0"; + doc["firmware_version"] = "1.1.0"; doc["free_storage_pct"] = 100; doc["wifi_rssi"] = wifi_rssi; doc["pending_records"] = (int)(s_cam_buf.size() + s_ble_buf.size()); doc["uptime_seconds"] = uptime_s; + + // Diagnostics (new in 1.1.0) + doc["reset_reason"] = (int)esp_reset_reason(); + doc["heap_free"] = (int)esp_get_free_heap_size(); + doc["heap_min_free"] = (int)esp_get_minimum_free_heap_size(); + doc["last_disconnect_code"] = (int)net_guard_last_disconnect_reason(); + + // Last 8 event-log entries, newest first + EventLogEntry recent[8]; + size_t n = event_log_read_recent(recent, 8); + JsonArray evs = doc["recent_events"].to(); + for (size_t i = 0; i < n; i++) { + JsonObject e = evs.add(); + e["t"] = recent[i].tag; + e["d0"] = recent[i].data0; + e["d1"] = recent[i].data1; + e["ts"] = recent[i].ts_unix; + e["up"] = recent[i].uptime_s; + } + String body; serializeJson(doc, body); return post_json(cfg, "/api/v1/heartbeat", body); } From 867e90b1f61fbe907002852c1be51baf794730b1 Mon Sep 17 00:00:00 2001 From: Peter Woolery Date: Thu, 23 Apr 2026 13:59:31 -0700 Subject: [PATCH 14/17] feat(server): heartbeat-diagnostics stub + migration for real server import The real server lives in a separate repo; this repo carries reference stubs for each endpoint (see camera_endpoint.py precedent). Adds the Pydantic extension, persistence helper, migration 005, and tests that the real server can copy when adding diagnostic-field support. Matches the firmware v1.1.0 heartbeat payload shape. Old-shape payloads (firmware v1.0.0) continue to parse cleanly with the new fields defaulting to None. --- server/heartbeat_diagnostics_stub.py | 125 ++++++++++++++ .../migrations/005_heartbeat_diagnostics.sql | 14 ++ server/test_heartbeat_diagnostics_stub.py | 156 ++++++++++++++++++ 3 files changed, 295 insertions(+) create mode 100644 server/heartbeat_diagnostics_stub.py create mode 100644 server/migrations/005_heartbeat_diagnostics.sql create mode 100644 server/test_heartbeat_diagnostics_stub.py diff --git a/server/heartbeat_diagnostics_stub.py b/server/heartbeat_diagnostics_stub.py new file mode 100644 index 0000000..ea2a714 --- /dev/null +++ b/server/heartbeat_diagnostics_stub.py @@ -0,0 +1,125 @@ +# server/heartbeat_diagnostics_stub.py +# Add these models and the persistence helper to the server's main.py alongside +# the existing heartbeat endpoint (POST /api/v1/heartbeat). +# Requires: diagnostic columns on the heartbeats table (see migrations/005_heartbeat_diagnostics.sql) +# +# Firmware v1.1.0 extends the heartbeat payload with five optional diagnostic +# fields. v1.0.0-shape payloads (without these fields) must continue to parse +# cleanly — every new field is Optional and defaults to None. +# +# IMPORTANT: Adjust the table name in store_heartbeat_diagnostics to match the +# real server's schema if it differs from "heartbeats". + +import json +import sqlite3 +from typing import List, Optional + +from pydantic import BaseModel + + +class RecentEvent(BaseModel): + t: int # EventLogTag (see EVENT_TAG_DECODER) + d0: int # tag-specific datum 0 + d1: int # tag-specific datum 1 + ts: int # unix timestamp (seconds) + up: int # seconds since boot when event was logged + + +# Extend the existing HeartbeatRequest model in main.py by adding these five +# optional fields. The rest of the heartbeat model (device_id, uptime, etc.) +# stays as-is. Shown here as a standalone model for reference/testing. +class HeartbeatDiagnosticsFields(BaseModel): + reset_reason: Optional[int] = None + heap_free: Optional[int] = None + heap_min_free: Optional[int] = None + last_disconnect_code: Optional[int] = None + recent_events: Optional[List[RecentEvent]] = None + + +# Example of the fully-extended heartbeat request model (merge into the +# existing HeartbeatRequest in main.py rather than introducing a second class): +class HeartbeatRequestWithDiagnostics(BaseModel): + device_id: str + uptime: int + # ... existing fields from the v1.0.0 heartbeat model go here ... + # New v1.1.0 diagnostic fields: + reset_reason: Optional[int] = None + heap_free: Optional[int] = None + heap_min_free: Optional[int] = None + last_disconnect_code: Optional[int] = None + recent_events: Optional[List[RecentEvent]] = None + + +# Call this inside the existing receive_heartbeat handler after the base +# heartbeat row has been inserted/updated. It persists the diagnostic fields +# on the same row keyed by device_id. +def store_heartbeat_diagnostics( + db: sqlite3.Connection, + device_id: str, + hb: HeartbeatRequestWithDiagnostics, +) -> None: + """Persist the v1.1.0 diagnostic fields onto the heartbeats row for device_id. + + recent_events is JSON-serialized into a TEXT column for flexibility; + the other four fields are stored as INTEGERs. All fields are nullable + and left untouched when the payload omits them (v1.0.0 compatibility). + """ + recent_events_json = ( + json.dumps([ev.model_dump() for ev in hb.recent_events]) + if hb.recent_events is not None + else None + ) + cursor = db.cursor() + cursor.execute( + """UPDATE heartbeats + SET reset_reason = ?, + heap_free = ?, + heap_min_free = ?, + last_disconnect_code = ?, + recent_events = ? + WHERE device_id = ?""", + ( + hb.reset_reason, + hb.heap_free, + hb.heap_min_free, + hb.last_disconnect_code, + recent_events_json, + device_id, + ), + ) + db.commit() + + +# --------------------------------------------------------------------------- +# Decoders — use these in dashboards / alerting to label the integer tags the +# firmware emits. Keep in sync with firmware/include/event_log.h. +# --------------------------------------------------------------------------- + +# EventLogTag values (RecentEvent.t) -> human name. +# Per-tag interpretation of d0/d1: +# EVT_BOOT d0=esp_reset_reason() +# EVT_WIFI_UP d0=RSSI (int16 cast to uint16) +# EVT_WIFI_DOWN d0=disconnect reason (0xFF = silent-death) +# EVT_HTTP_OK d0=path_hash, d1=elapsed_ms +# EVT_HTTP_FAIL d0=path_hash, d1=http_status_or_errno +# EVT_HEARTBEAT_MISS d0=consecutive_count +# EVT_NTP_SYNC d0=seconds_since_boot (reserved, not emitted) +# EVT_REBOOT d0=RebootReason (see REBOOT_REASON_DECODER) +EVENT_TAG_DECODER = { + 1: "EVT_BOOT", + 2: "EVT_WIFI_UP", + 3: "EVT_WIFI_DOWN", + 4: "EVT_HTTP_OK", + 5: "EVT_HTTP_FAIL", + 6: "EVT_HEARTBEAT_MISS", + 7: "EVT_NTP_SYNC", + 8: "EVT_REBOOT", +} + +# EVT_REBOOT.d0 values -> human name. Firmware-initiated reboot reasons. +REBOOT_REASON_DECODER = { + 1: "HEARTBEAT_MISS", + 2: "FACTORY_RESET", + 3: "OTA", + 4: "WIFI_REPROV", +} diff --git a/server/migrations/005_heartbeat_diagnostics.sql b/server/migrations/005_heartbeat_diagnostics.sql new file mode 100644 index 0000000..9a806ad --- /dev/null +++ b/server/migrations/005_heartbeat_diagnostics.sql @@ -0,0 +1,14 @@ +-- migrations/005_heartbeat_diagnostics.sql +-- Add v1.1.0 diagnostic columns to the existing heartbeats table. +-- Adjust the table name ("heartbeats") to match the real server's schema. +-- Apply: sqlite3 < migrations/005_heartbeat_diagnostics.sql +-- +-- sqlite's ALTER TABLE ADD COLUMN only takes one column per statement, so +-- each field is added separately. All columns are nullable, so firmware +-- v1.0.0 payloads (which omit these fields) remain accepted unchanged. + +ALTER TABLE heartbeats ADD COLUMN reset_reason INTEGER; +ALTER TABLE heartbeats ADD COLUMN heap_free INTEGER; +ALTER TABLE heartbeats ADD COLUMN heap_min_free INTEGER; +ALTER TABLE heartbeats ADD COLUMN last_disconnect_code INTEGER; +ALTER TABLE heartbeats ADD COLUMN recent_events TEXT; -- JSON-serialized list of {t,d0,d1,ts,up} diff --git a/server/test_heartbeat_diagnostics_stub.py b/server/test_heartbeat_diagnostics_stub.py new file mode 100644 index 0000000..cbf72fd --- /dev/null +++ b/server/test_heartbeat_diagnostics_stub.py @@ -0,0 +1,156 @@ +# server/test_heartbeat_diagnostics_stub.py +# Template tests for the heartbeat diagnostic-fields extension. +# Adapt imports and fixtures to match the actual server's test structure. +# +# To run against the actual server (once integrated): +# pytest server/test_heartbeat_diagnostics_stub.py -v + +import json +import sqlite3 + + +def _make_db() -> sqlite3.Connection: + """In-memory sqlite fixture matching migrations/005_heartbeat_diagnostics.sql + applied on top of a minimal heartbeats table.""" + db = sqlite3.connect(":memory:") + db.execute(""" + CREATE TABLE heartbeats ( + device_id TEXT PRIMARY KEY, + uptime INTEGER, + reset_reason INTEGER, + heap_free INTEGER, + heap_min_free INTEGER, + last_disconnect_code INTEGER, + recent_events TEXT + ) + """) + db.commit() + return db + + +def _v10_payload() -> dict: + """Firmware v1.0.0-shape heartbeat: no diagnostic fields.""" + return {"device_id": "dc-test-01", "uptime": 12345} + + +def _v11_payload() -> dict: + """Firmware v1.1.0-shape heartbeat: includes all five diagnostic fields.""" + return { + "device_id": "dc-test-01", + "uptime": 12345, + "reset_reason": 1, + "heap_free": 123456, + "heap_min_free": 100000, + "last_disconnect_code": 201, + "recent_events": [ + {"t": 1, "d0": 1, "d1": 0, "ts": 1712000000, "up": 0}, + {"t": 3, "d0": 255, "d1": 0, "ts": 1712000050, "up": 50}, + ], + } + + +def test_v10_shape_parses_with_new_fields_none(): + """A v1.0.0 heartbeat (no diagnostic fields) must parse cleanly; all new + fields default to None.""" + from server.heartbeat_diagnostics_stub import HeartbeatRequestWithDiagnostics + + hb = HeartbeatRequestWithDiagnostics(**_v10_payload()) + assert hb.device_id == "dc-test-01" + assert hb.uptime == 12345 + assert hb.reset_reason is None + assert hb.heap_free is None + assert hb.heap_min_free is None + assert hb.last_disconnect_code is None + assert hb.recent_events is None + + +def test_v11_shape_populates_new_fields(): + """A v1.1.0 heartbeat populates each diagnostic field and the event list.""" + from server.heartbeat_diagnostics_stub import HeartbeatRequestWithDiagnostics + + hb = HeartbeatRequestWithDiagnostics(**_v11_payload()) + assert hb.reset_reason == 1 + assert hb.heap_free == 123456 + assert hb.heap_min_free == 100000 + assert hb.last_disconnect_code == 201 + assert hb.recent_events is not None + assert len(hb.recent_events) == 2 + assert hb.recent_events[0].t == 1 + assert hb.recent_events[1].t == 3 + assert hb.recent_events[1].d0 == 255 # 0xFF silent-death marker + assert hb.recent_events[1].ts == 1712000050 + + +def test_store_heartbeat_diagnostics_writes_fields_and_json(): + """store_heartbeat_diagnostics must JSON-serialize recent_events and write + each integer field as submitted.""" + from server.heartbeat_diagnostics_stub import ( + HeartbeatRequestWithDiagnostics, + store_heartbeat_diagnostics, + ) + + db = _make_db() + # Seed the heartbeats row the base handler would have inserted first. + db.execute( + "INSERT INTO heartbeats (device_id, uptime) VALUES (?, ?)", + ("dc-test-01", 12345), + ) + db.commit() + + hb = HeartbeatRequestWithDiagnostics(**_v11_payload()) + store_heartbeat_diagnostics(db, "dc-test-01", hb) + + row = db.execute( + """SELECT reset_reason, heap_free, heap_min_free, + last_disconnect_code, recent_events + FROM heartbeats + WHERE device_id = ?""", + ("dc-test-01",), + ).fetchone() + assert row[0] == 1 + assert row[1] == 123456 + assert row[2] == 100000 + assert row[3] == 201 + events = json.loads(row[4]) + assert isinstance(events, list) + assert len(events) == 2 + assert events[0] == {"t": 1, "d0": 1, "d1": 0, "ts": 1712000000, "up": 0} + assert events[1]["d0"] == 255 + + +def test_store_heartbeat_diagnostics_v10_leaves_fields_null(): + """v1.0.0 payload: all diagnostic columns should remain NULL after store.""" + from server.heartbeat_diagnostics_stub import ( + HeartbeatRequestWithDiagnostics, + store_heartbeat_diagnostics, + ) + + db = _make_db() + db.execute( + "INSERT INTO heartbeats (device_id, uptime) VALUES (?, ?)", + ("dc-test-01", 12345), + ) + db.commit() + + hb = HeartbeatRequestWithDiagnostics(**_v10_payload()) + store_heartbeat_diagnostics(db, "dc-test-01", hb) + + row = db.execute( + """SELECT reset_reason, heap_free, heap_min_free, + last_disconnect_code, recent_events + FROM heartbeats + WHERE device_id = ?""", + ("dc-test-01",), + ).fetchone() + assert row == (None, None, None, None, None) + + +def test_event_tag_decoder_labels(): + """Sanity check: decoder maps firmware tag values to the expected names.""" + from server.heartbeat_diagnostics_stub import EVENT_TAG_DECODER, REBOOT_REASON_DECODER + + assert EVENT_TAG_DECODER[1] == "EVT_BOOT" + assert EVENT_TAG_DECODER[3] == "EVT_WIFI_DOWN" + assert EVENT_TAG_DECODER[8] == "EVT_REBOOT" + assert REBOOT_REASON_DECODER[1] == "HEARTBEAT_MISS" + assert REBOOT_REASON_DECODER[4] == "WIFI_REPROV" From 2d95069bd1a7294770847cddeb753f15ae65f674 Mon Sep 17 00:00:00 2001 From: Peter Woolery Date: Thu, 23 Apr 2026 14:02:09 -0700 Subject: [PATCH 15/17] docs: network-resilience firmware 1.1 deployment + field diagnostic guide Flash command, expected first-boot behavior, per-feature summary of the 1.1 release, 24-hour field-check playbook, and a reference table for decoding the heartbeat's recent_events array. --- README.md | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/README.md b/README.md index f99b0a2..7237d71 100644 --- a/README.md +++ b/README.md @@ -197,3 +197,70 @@ Capture a boot log with timestamps: ```bash python tools/serial_monitor.py --port /dev/ttyUSB0 --reset --timestamp --seconds 30 ``` + +## Deploying firmware 1.1 (network resilience) + +### Flash command + +```bash +cd firmware && pio run -e timercam -t upload +``` + +### Expected first boot + +On the serial log (115200 baud), the device prints the boot banner, then +initializes `event_log`, then records the reset reason via `EVT_BOOT`. +The first heartbeat fires roughly 60-70s after power-on (15s WiFi +busy-wait + NTP sync + 60s `BOOT_REPORT_DELAY_S`). Monitor with +`pio device monitor` or: + +```bash +python tools/serial_monitor.py --port /dev/ttyUSB0 --reset --timestamp --seconds 90 +``` + +### What's new in 1.1 + +- Event-driven WiFi reconnect with 1s→60s exponential backoff (`net_guard` module); disconnect reasons logged. +- HTTP timeouts (5s connect / 10s response) + 3-try retry on every POST. +- ESP-IDF Task Watchdog (30s) on camera, reporter, and loop tasks; panic → reboot → reason surfaces in the next heartbeat. +- Software heartbeat-miss watchdog: 6 consecutive missed heartbeats (~6 h) triggers a clean reboot. +- Persistent NVS event-log ring buffer (32 entries) surfaced in the heartbeat's `recent_events` field. +- New heartbeat fields: `reset_reason`, `heap_free`, `heap_min_free`, `last_disconnect_code`, `recent_events`. + +### 24-hour field checks + +After deploying a device, run through this checklist against the server's +heartbeat records at the 24-hour mark: + +- **Heartbeat count ≥ 22** — ≥ 92% uptime across 24 h at the hourly cadence. +- **No sustained `t=6` (EVT_HEARTBEAT_MISS) entries in `recent_events`** — transient singletons are expected; repeated misses indicate a sticky network problem worth investigating. +- **`heap_min_free` stable day over day** — a downward drift indicates a leak. Alert threshold: min-free drops by more than 20% vs baseline. +- **`last_disconnect_code` matches known AP behavior** — reason 8 (assoc lost) and reason 15 (4-way handshake timeout) are common on busy APs; recurring reason 200+ indicates a firmware bug. +- **`reset_reason` has no unexpected values** — see table below. + +| `reset_reason` | Meaning | Expected? | +|----------------|---------|-----------| +| 1 | Power-on | Normal immediately after a deployment. | +| 4 | Software reset (our `ESP.restart()`) | Correlate with `EVT_REBOOT` in `recent_events`. | +| 6 | Task watchdog | Investigate — a task hung for 30s. | +| 7 | Brownout | Investigate power supply / USB cable. | +| 8 | SDIO reset | Unusual — investigate. | + +### Decoding recent_events + +The `recent_events` array is a ring buffer of `{t, d0, d1, ts}` entries. +Tag definitions live in `firmware/lib/event_log/event_log.h`: + +| `t` | Event | `d0` | `d1` | +|-----|-------|------|------| +| 1 | `EVT_BOOT` | `esp_reset_reason()` | — | +| 2 | `EVT_WIFI_UP` | RSSI | — | +| 3 | `EVT_WIFI_DOWN` | disconnect reason code; `0xFF` = silent-death fallback | — | +| 4 | `EVT_HTTP_OK` | fnv1a-16 path hash | elapsed ms (capped at 65535) | +| 5 | `EVT_HTTP_FAIL` | path hash | HTTP status or negative errno cast to `uint16` | +| 6 | `EVT_HEARTBEAT_MISS` | consecutive miss count | — | +| 7 | `EVT_NTP_SYNC` | reserved | — | +| 8 | `EVT_REBOOT` | `RebootReason`: 1=HEARTBEAT_MISS, 2=FACTORY_RESET, 3=OTA, 4=WIFI_REPROV | — | + +Server-side decoder tables (`EVENT_TAG_DECODER`, `REBOOT_REASON_DECODER`) +live in `server/heartbeat_diagnostics_stub.py`. From d943b3df5ac163c32f1ccc6d734eea3a34665c4c Mon Sep 17 00:00:00 2001 From: Peter Woolery Date: Thu, 23 Apr 2026 14:03:57 -0700 Subject: [PATCH 16/17] feat(firmware): log reason before FATAL hang loops Two FATAL while(true) hangs in main.cpp (config load fail, camera init fail) previously relied on the hardware watchdog to reboot the device, leaving the cause invisible beyond a generic TWDT reset reason. Now each path logs EVT_REBOOT with REBOOT_FATAL_CONFIG or REBOOT_FATAL_CAMERA before hanging, so the next heartbeat's recent_events surfaces which branch hung. Server-side decoder updated for the two new enum values. Co-Authored-By: Claude Opus 4.7 (1M context) --- firmware/lib/event_log/event_log.h | 2 ++ firmware/src/main.cpp | 2 ++ server/heartbeat_diagnostics_stub.py | 2 ++ 3 files changed, 6 insertions(+) diff --git a/firmware/lib/event_log/event_log.h b/firmware/lib/event_log/event_log.h index 470f45a..6f68978 100644 --- a/firmware/lib/event_log/event_log.h +++ b/firmware/lib/event_log/event_log.h @@ -19,6 +19,8 @@ enum RebootReason : uint8_t { REBOOT_FACTORY_RESET = 2, REBOOT_OTA = 3, REBOOT_WIFI_REPROV = 4, + REBOOT_FATAL_CONFIG = 5, + REBOOT_FATAL_CAMERA = 6, }; struct EventLogEntry { diff --git a/firmware/src/main.cpp b/firmware/src/main.cpp index 9767a87..6f5576c 100644 --- a/firmware/src/main.cpp +++ b/firmware/src/main.cpp @@ -169,6 +169,7 @@ void setup() { if (!config_load(g_cfg)) { Serial.println("FATAL: device_id/location_id/hmac_secret not provisioned"); + event_log_write(EVT_REBOOT, REBOOT_FATAL_CONFIG, 0); while (true) { delay(500); led_set(!digitalRead(LED_PIN)); } // fast blink } @@ -203,6 +204,7 @@ void setup() { if (!camera_init()) { Serial.println("FATAL: camera init failed"); + event_log_write(EVT_REBOOT, REBOOT_FATAL_CAMERA, 0); while (true) delay(1000); } diff --git a/server/heartbeat_diagnostics_stub.py b/server/heartbeat_diagnostics_stub.py index ea2a714..30e7943 100644 --- a/server/heartbeat_diagnostics_stub.py +++ b/server/heartbeat_diagnostics_stub.py @@ -122,4 +122,6 @@ REBOOT_REASON_DECODER = { 2: "FACTORY_RESET", 3: "OTA", 4: "WIFI_REPROV", + 5: "FATAL_CONFIG", + 6: "FATAL_CAMERA", } From a795cfa0ade47d3433b772136569a760bea25b37 Mon Sep 17 00:00:00 2001 From: Peter Woolery Date: Thu, 23 Apr 2026 14:10:32 -0700 Subject: [PATCH 17/17] fix(firmware): reboot on FATAL failures + emit NTP_SYNC + server-coord warning - Config-load and camera-init FATAL branches now reboot (3s LED signal before restart) instead of hanging forever. Matches the enum name REBOOT_FATAL_* and makes camera-init failures diagnosable via the next boot's heartbeat recent_events. Config failures produce a visible reboot loop rather than a silent hang. - Emit EVT_NTP_SYNC(seconds_since_boot) on the first NTP-synced reporter iteration so slow / failed NTP sync is a visible signal in the heartbeat's recent_events window. - README "Deploying firmware 1.1" now opens with a "Before you flash" warning directing the operator to land server-side heartbeat schema changes first (migration 005 + stub integration) to avoid a strict-schema 4xx reboot loop after deployment. --- README.md | 22 ++++++++++++++++++++++ firmware/src/main.cpp | 14 ++++++++++++-- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7237d71..1dfcb0e 100644 --- a/README.md +++ b/README.md @@ -200,6 +200,28 @@ python tools/serial_monitor.py --port /dev/ttyUSB0 --reset --timestamp --seconds ## Deploying firmware 1.1 (network resilience) +### Before you flash + +Firmware 1.1 adds five new fields to the `POST /api/v1/heartbeat` payload +(`reset_reason`, `heap_free`, `heap_min_free`, `last_disconnect_code`, +`recent_events`). **The real server must accept these optional fields before +you deploy firmware 1.1**, or strict-schema validation will 4xx every +heartbeat; after 6 consecutive misses (~6h) the heartbeat-miss watchdog +will reboot the device, producing a reboot loop. + +Reference migration and handler code for the real server are in this repo: + +- `server/heartbeat_diagnostics_stub.py` — Pydantic model extensions, + `store_heartbeat_diagnostics()` helper, and `EVENT_TAG_DECODER` / + `REBOOT_REASON_DECODER` reference tables. +- `server/migrations/005_heartbeat_diagnostics.sql` — adds five nullable + columns to the `heartbeats` table (adjust table name to match the real + server's schema). + +Copy the stub additions into the production server repo, run the +migration, and confirm a v1.1.0-shape heartbeat returns 200 before you +flash any device. + ### Flash command ```bash diff --git a/firmware/src/main.cpp b/firmware/src/main.cpp index 6f5576c..765ac6c 100644 --- a/firmware/src/main.cpp +++ b/firmware/src/main.cpp @@ -107,6 +107,7 @@ static void task_reporter(void*) { // First valid timestamp — schedule boot report 60s from now if (last_report_ts == 0) { + event_log_write(EVT_NTP_SYNC, (uint16_t)(millis() / 1000), 0); last_report_ts = now - (REPORT_INTERVAL_S - BOOT_REPORT_DELAY_S); continue; } @@ -170,7 +171,14 @@ void setup() { if (!config_load(g_cfg)) { Serial.println("FATAL: device_id/location_id/hmac_secret not provisioned"); event_log_write(EVT_REBOOT, REBOOT_FATAL_CONFIG, 0); - while (true) { delay(500); led_set(!digitalRead(LED_PIN)); } // fast blink + // Blink fast for 3s so a physically-present operator can see it, + // then reboot so EVT_BOOT history on the next heartbeat surfaces + // the failure — though in this case the device can't heartbeat + // without config, so the real signal is the fast-blink-then-reboot + // cycle visible on the LED. + uint32_t t0 = millis(); + while (millis() - t0 < 3000) { led_set(!digitalRead(LED_PIN)); delay(100); } + ESP.restart(); } // Connect to WiFi @@ -205,7 +213,9 @@ void setup() { if (!camera_init()) { Serial.println("FATAL: camera init failed"); event_log_write(EVT_REBOOT, REBOOT_FATAL_CAMERA, 0); - while (true) delay(1000); + uint32_t t0 = millis(); + while (millis() - t0 < 3000) { led_set(!digitalRead(LED_PIN)); delay(100); } + ESP.restart(); } reporter_init();