From a585a56cff5067278a494842af2baabd6582c26f Mon Sep 17 00:00:00 2001 From: Peter Woolery Date: Fri, 1 May 2026 11:34:17 -0700 Subject: [PATCH] fix(firmware): upgrade NimBLE to 2.x + DNS fallback for unreliable resolvers NimBLE-Arduino 1.4.2 had an init/fire race in its FreeRTOS callout porting layer where os_callout_timer_cb dispatched a queued TimerHandle expiry against a not-yet-initialized event (NULL fn pointer), causing PC=0 InstrFetchProhibited within ~1s of boot when the camera task starved the timer service. Confirmed by ets_printf instrumentation. Upgrading to ^2.0.0 rewrites the porting layer and eliminates the race; verified clean on the customer network for 1+ hour. Also rolls in DNS-resilience work that surfaced the BLE crash during provisioning: pin lwIP/esp-netif resolvers to 1.1.1.1/8.8.8.8 across DHCP renewals, add three-tier resolver fallback in reporter with a hardcoded IP of last resort, and switch to raw WiFiClient with manual Host header to bypass HTTPClient's brittle DNS path. Migration touches for NimBLE 2.x: - NimBLEAdvertisedDeviceCallbacks -> NimBLEScanCallbacks - onResult signature now takes const NimBLEAdvertisedDevice* - setAdvertisedDeviceCallbacks -> setScanCallbacks - start(0, nullptr, false) -> start(0, false, false) Co-Authored-By: Claude Opus 4.7 (1M context) --- .../plans/2026-04-23-network-resilience.md | 1032 +++++++++++++++++ .../plans/2026-05-01-ble-nimble-crash.md | 189 +++ firmware/lib/net_guard/net_guard.cpp | 63 + firmware/lib/net_guard/net_guard.h | 9 + firmware/platformio.ini | 2 +- firmware/src/ble_scanner.cpp | 11 +- firmware/src/main.cpp | 47 +- firmware/src/reporter.cpp | 103 +- firmware/src/reporter.h | 9 +- 9 files changed, 1434 insertions(+), 31 deletions(-) create mode 100644 docs/superpowers/plans/2026-04-23-network-resilience.md create mode 100644 docs/superpowers/plans/2026-05-01-ble-nimble-crash.md diff --git a/docs/superpowers/plans/2026-04-23-network-resilience.md b/docs/superpowers/plans/2026-04-23-network-resilience.md new file mode 100644 index 0000000..c425090 --- /dev/null +++ b/docs/superpowers/plans/2026-04-23-network-resilience.md @@ -0,0 +1,1032 @@ +# Network Resilience Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Harden the DoorCounter firmware (ESP32 TimerCam) against silent WiFi/HTTP failures observed in the field, and add persistent diagnostic logging so the next failure is post-hoc diagnosable without serial access. + +**Architecture:** Add two new focused modules — `event_log` (NVS ring buffer of boot/wifi/http events) and `net_guard` (event-driven WiFi reconnect + exponential backoff). Retrofit `reporter.cpp` with HTTP timeouts and retry. Enable ESP-IDF Task Watchdog (TWDT) on the two long-running tasks. Add a software-level heartbeat-miss watchdog that reboots after N consecutive failures. Extend the heartbeat payload with diagnostic fields (heap, reboot reason, last disconnect code, recent events) so server-side observability improves without requiring field visits. + +**Tech Stack:** Arduino-ESP32 (platform-espressif32@6.6.0), NimBLE-Arduino, HTTPClient, ArduinoJson 7, Preferences (NVS), `esp_task_wdt.h`, `esp_system.h`, Unity (PlatformIO `test_native` env). + +**Out of scope:** CV pipeline changes, BLE coexistence retuning, TLS/HTTPS migration, OTA hardening. + +--- + +## File Structure + +**New files:** +- `firmware/src/event_log.h` / `firmware/src/event_log.cpp` — NVS-backed 32-entry ring buffer of tagged events (boot, reboot_reason, wifi_up, wifi_down, http_ok, http_fail, heartbeat_miss). Each entry is a fixed 32-byte record. +- `firmware/src/net_guard.h` / `firmware/src/net_guard.cpp` — `WiFi.onEvent()` handler + connection state machine with exponential backoff (1s → 2s → 4s → ... → 60s cap). Pure policy function `net_guard_next_backoff_ms(attempt)` is unit-tested natively. +- `firmware/test/test_native/test_net_guard.cpp` — Unity tests for backoff policy. +- `firmware/test/test_native/test_event_log.cpp` — Unity tests for ring-buffer encode/decode on a heap-backed NVS stub. + +**Modified files:** +- `firmware/src/reporter.cpp` — add connect/response timeouts, add retry wrapper, log each POST attempt to event_log, return status from `reporter_heartbeat`. +- `firmware/src/reporter.h` — change `reporter_heartbeat` signature to return `bool`. +- `firmware/src/main.cpp` — boot-event logging, wire `net_guard`, enable TWDT, add heartbeat-miss watchdog, remove polling reconnect from `loop()`. +- `firmware/platformio.ini` — add `test_native` build flags for stubbing ESP headers if needed. + +--- + +## Task 1: Event log module — encoding and ring-buffer policy (pure, unit-tested) + +**Files:** +- Create: `firmware/src/event_log.h` +- Create: `firmware/src/event_log.cpp` +- Test: `firmware/test/test_native/test_event_log.cpp` + +**Why first:** every other task logs through this module. Landing it first with unit tests gives the rest of the plan a stable dependency. + +- [ ] **Step 1: Write the header** + +Create `firmware/src/event_log.h`: + +```cpp +// firmware/src/event_log.h +#pragma once +#include +#include + +enum EventLogTag : uint8_t { + EVT_BOOT = 1, // data0 = esp_reset_reason() value + EVT_WIFI_UP = 2, // data0 = rssi (signed, cast) + EVT_WIFI_DOWN = 3, // data0 = disconnect reason code + EVT_HTTP_OK = 4, // data0 = path hash (fnv1a16), data1 = elapsed_ms + EVT_HTTP_FAIL = 5, // data0 = path hash, data1 = (http_code or negative errno) + EVT_HEARTBEAT_MISS = 6, // data0 = consecutive miss count + EVT_NTP_SYNC = 7, // data0 = seconds since boot + EVT_REBOOT = 8, // data0 = reason enum (defined below) +}; + +enum RebootReason : uint8_t { + REBOOT_HEARTBEAT_MISS = 1, + REBOOT_FACTORY_RESET = 2, + REBOOT_OTA = 3, + REBOOT_WIFI_REPROV = 4, +}; + +struct EventLogEntry { + uint32_t ts_unix; // 0 if NTP not synced yet; fall back to millis/1000 + uint32_t uptime_s; // millis()/1000 at log time + uint16_t data0; + uint16_t data1; + uint8_t tag; // EventLogTag + uint8_t seq; // rolling sequence, wraps + uint8_t _pad[14]; // pad to 32 bytes for fixed slot size +} __attribute__((packed)); +static_assert(sizeof(EventLogEntry) == 32, "EventLogEntry must be 32 bytes"); + +// NVS-backed 32-slot ring buffer. Safe to call before NTP sync. +void event_log_init(); +void event_log_write(EventLogTag tag, uint16_t data0 = 0, uint16_t data1 = 0); +size_t event_log_read_recent(EventLogEntry* out, size_t max_entries); +uint16_t event_log_path_hash(const char* path); // fnv1a16 — exposed for tests +``` + +- [ ] **Step 2: Write failing tests** + +Create `firmware/test/test_native/test_event_log.cpp`: + +```cpp +// firmware/test/test_native/test_event_log.cpp +#include +#include +#include "../../src/event_log.h" + +// --- Native NVS stub (declared weak in event_log.cpp for native builds) --- +extern "C" void event_log_test_reset(); + +void setUp() { event_log_test_reset(); } +void tearDown() {} + +void test_entry_is_32_bytes() { + TEST_ASSERT_EQUAL(32, sizeof(EventLogEntry)); +} + +void test_path_hash_is_stable_and_differs() { + uint16_t a = event_log_path_hash("/api/v1/heartbeat"); + uint16_t b = event_log_path_hash("/api/v1/heartbeat"); + uint16_t c = event_log_path_hash("/api/v1/camera/events/batch"); + TEST_ASSERT_EQUAL(a, b); + TEST_ASSERT_NOT_EQUAL(a, c); +} + +void test_write_then_read_recent_returns_newest_first() { + event_log_init(); + event_log_write(EVT_BOOT, 1, 0); + event_log_write(EVT_WIFI_UP, 2, 0); + event_log_write(EVT_HTTP_FAIL, 3, 500); + EventLogEntry buf[8]; + size_t n = event_log_read_recent(buf, 8); + TEST_ASSERT_EQUAL(3, n); + TEST_ASSERT_EQUAL(EVT_HTTP_FAIL, buf[0].tag); + TEST_ASSERT_EQUAL(500, buf[0].data1); + TEST_ASSERT_EQUAL(EVT_WIFI_UP, buf[1].tag); + TEST_ASSERT_EQUAL(EVT_BOOT, buf[2].tag); +} + +void test_ring_buffer_wraps_after_32_entries() { + event_log_init(); + for (int i = 0; i < 40; i++) event_log_write(EVT_HTTP_OK, (uint16_t)i, 0); + EventLogEntry buf[32]; + size_t n = event_log_read_recent(buf, 32); + TEST_ASSERT_EQUAL(32, n); + // Newest first: data0 should be 39, 38, 37, ... down to 8 + TEST_ASSERT_EQUAL(39, buf[0].data0); + TEST_ASSERT_EQUAL(8, buf[31].data0); +} + +int main() { + UNITY_BEGIN(); + RUN_TEST(test_entry_is_32_bytes); + RUN_TEST(test_path_hash_is_stable_and_differs); + RUN_TEST(test_write_then_read_recent_returns_newest_first); + RUN_TEST(test_ring_buffer_wraps_after_32_entries); + return UNITY_END(); +} +``` + +- [ ] **Step 3: Run tests to verify they fail** + +Run: `cd firmware && pio test -e native -f test_event_log` +Expected: FAIL — linker error, event_log.cpp does not exist. + +- [ ] **Step 4: Implement event_log.cpp** + +Create `firmware/src/event_log.cpp`: + +```cpp +// firmware/src/event_log.cpp +#include "event_log.h" +#include + +#ifdef ARDUINO + #include + #include + #include + static Preferences s_prefs; + static const char* NVS_NS = "evlog"; + static const char* NVS_HEAD = "head"; // next write slot (0..31) + static const char* NVS_CNT = "cnt"; // total writes (for seq) +#else + // Native build: in-memory stub + #include + static uint8_t g_slots[32 * 32]; + static uint32_t g_head = 0; + static uint32_t g_cnt = 0; + extern "C" void event_log_test_reset() { + memset(g_slots, 0, sizeof(g_slots)); + g_head = 0; g_cnt = 0; + } +#endif + +static const size_t SLOTS = 32; +static const size_t SLOT_SIZE = sizeof(EventLogEntry); + +uint16_t event_log_path_hash(const char* path) { + // fnv1a-16 (fold 32-bit fnv1a down to 16 bits) + uint32_t h = 0x811c9dc5u; + while (*path) { h ^= (uint8_t)*path++; h *= 0x01000193u; } + return (uint16_t)((h >> 16) ^ (h & 0xFFFF)); +} + +static void slot_write(size_t idx, const EventLogEntry& e) { +#ifdef ARDUINO + char key[8]; snprintf(key, sizeof(key), "s%u", (unsigned)idx); + s_prefs.putBytes(key, &e, SLOT_SIZE); +#else + memcpy(&g_slots[idx * SLOT_SIZE], &e, SLOT_SIZE); +#endif +} + +static bool slot_read(size_t idx, EventLogEntry& e) { +#ifdef ARDUINO + char key[8]; snprintf(key, sizeof(key), "s%u", (unsigned)idx); + size_t n = s_prefs.getBytes(key, &e, SLOT_SIZE); + return n == SLOT_SIZE; +#else + memcpy(&e, &g_slots[idx * SLOT_SIZE], SLOT_SIZE); + return true; +#endif +} + +void event_log_init() { +#ifdef ARDUINO + s_prefs.begin(NVS_NS, /*readOnly=*/false); +#else + // nothing +#endif +} + +void event_log_write(EventLogTag tag, uint16_t data0, uint16_t data1) { + EventLogEntry e = {}; +#ifdef ARDUINO + time_t now = time(nullptr); + e.ts_unix = (now > 1700000000) ? (uint32_t)now : 0; + e.uptime_s = (uint32_t)(millis() / 1000); + uint32_t head = s_prefs.getUInt(NVS_HEAD, 0); + uint32_t cnt = s_prefs.getUInt(NVS_CNT, 0); +#else + e.ts_unix = 0; + e.uptime_s = g_cnt; // stand-in for uptime in native tests + uint32_t head = g_head; + uint32_t cnt = g_cnt; +#endif + e.tag = (uint8_t)tag; + e.data0 = data0; + e.data1 = data1; + e.seq = (uint8_t)(cnt & 0xFF); + slot_write(head % SLOTS, e); +#ifdef ARDUINO + s_prefs.putUInt(NVS_HEAD, (head + 1) % SLOTS); + s_prefs.putUInt(NVS_CNT, cnt + 1); +#else + g_head = (head + 1) % SLOTS; + g_cnt = cnt + 1; +#endif +} + +size_t event_log_read_recent(EventLogEntry* out, size_t max_entries) { +#ifdef ARDUINO + uint32_t head = s_prefs.getUInt(NVS_HEAD, 0); + uint32_t cnt = s_prefs.getUInt(NVS_CNT, 0); +#else + uint32_t head = g_head; + uint32_t cnt = g_cnt; +#endif + size_t available = (cnt < SLOTS) ? cnt : SLOTS; + size_t n = (max_entries < available) ? max_entries : available; + for (size_t i = 0; i < n; i++) { + // newest is at (head - 1), then (head - 2), ... modulo SLOTS + size_t idx = (head + SLOTS - 1 - i) % SLOTS; + slot_read(idx, out[i]); + } + return n; +} +``` + +- [ ] **Step 5: Run tests to verify they pass** + +Run: `cd firmware && pio test -e native -f test_event_log` +Expected: PASS — 4/4 tests, ring-buffer wraps correctly. + +- [ ] **Step 6: Commit** + +```bash +git add firmware/src/event_log.h firmware/src/event_log.cpp firmware/test/test_native/test_event_log.cpp +git commit -m "feat(firmware): add NVS-backed event log ring buffer + +Persistent 32-slot ring buffer of tagged diagnostic events (boot, wifi +up/down, http ok/fail, heartbeat miss, reboot). Used to diagnose field +failures post-hoc via the heartbeat payload, without needing serial +access. Native-native stub lets policy be unit-tested." +``` + +--- + +## Task 2: Wire event_log into boot path + +**Files:** +- Modify: `firmware/src/main.cpp:137-194` (setup function) + +- [ ] **Step 1: Add includes and boot-event logging** + +In `firmware/src/main.cpp`, add after line 10: + +```cpp +#include "event_log.h" +#include +``` + +Replace the opening of `setup()` at line 137-141 with: + +```cpp +void setup() { + Serial.begin(115200); + pinMode(LED_PIN, OUTPUT); + pinMode(BUTTON_PIN, INPUT_PULLUP); + led_set(true); // on = booting + + event_log_init(); + event_log_write(EVT_BOOT, (uint16_t)esp_reset_reason(), 0); +``` + +- [ ] **Step 2: Log reboot cause before each `ESP.restart()`** + +At line 49 (factory reset path), change: +```cpp + config_clear_wifi(); + ESP.restart(); +``` +to: +```cpp + event_log_write(EVT_REBOOT, REBOOT_FACTORY_RESET, 0); + config_clear_wifi(); + ESP.restart(); +``` + +At line 151 (missing wifi creds path), change: +```cpp + provisioning_run(); + ESP.restart(); +``` +to: +```cpp + provisioning_run(); + event_log_write(EVT_REBOOT, REBOOT_WIFI_REPROV, 0); + ESP.restart(); +``` + +Apply the same change at line 163-164 (failed creds path). + +At line 186 (OTA onEnd), change: +```cpp + ArduinoOTA.onEnd([]() { ble_scanner_resume(); ESP.restart(); }); +``` +to: +```cpp + ArduinoOTA.onEnd([]() { + ble_scanner_resume(); + event_log_write(EVT_REBOOT, REBOOT_OTA, 0); + ESP.restart(); + }); +``` + +- [ ] **Step 3: Device build + upload** + +Run: `cd firmware && pio run -e timercam -t upload && pio device monitor` +Expected: serial shows boot; after one more manual power cycle, serial shows no errors. Event log is not yet observable — that happens in Task 8. + +- [ ] **Step 4: Commit** + +```bash +git add firmware/src/main.cpp +git commit -m "feat(firmware): log boot and reboot reason to event log + +Every boot logs EVT_BOOT with esp_reset_reason(); every deliberate +ESP.restart() is preceded by EVT_REBOOT with a reason code. This +gives us a persistent answer to 'why did the device just reboot?'." +``` + +--- + +## Task 3: net_guard module — backoff policy (pure, unit-tested) + +**Files:** +- Create: `firmware/src/net_guard.h` +- Create: `firmware/src/net_guard.cpp` +- Test: `firmware/test/test_native/test_net_guard.cpp` + +- [ ] **Step 1: Write the header** + +Create `firmware/src/net_guard.h`: + +```cpp +// firmware/src/net_guard.h +#pragma once +#include +#include "config.h" + +// Exponential backoff: 1s, 2s, 4s, 8s, 16s, 32s, 60s, 60s, ... +// attempt 0 → 1000ms, clamped at 60000ms. +uint32_t net_guard_next_backoff_ms(uint32_t attempt); + +#ifdef ARDUINO +// Registers WiFi.onEvent() handler and starts auto-reconnect loop. +// Must be called once after WiFi.begin() succeeds. +void net_guard_start(const DeviceConfig& cfg); + +// True iff WiFi is currently associated with IP. +bool net_guard_is_up(); + +// Last disconnect reason code from WIFI_EVENT_STA_DISCONNECTED (0 = none). +uint8_t net_guard_last_disconnect_reason(); +#endif +``` + +- [ ] **Step 2: Write failing tests** + +Create `firmware/test/test_native/test_net_guard.cpp`: + +```cpp +// firmware/test/test_native/test_net_guard.cpp +#include +#include "../../src/net_guard.h" + +void setUp() {} +void tearDown() {} + +void test_backoff_starts_at_one_second() { + TEST_ASSERT_EQUAL(1000, net_guard_next_backoff_ms(0)); +} + +void test_backoff_doubles_each_attempt() { + TEST_ASSERT_EQUAL(2000, net_guard_next_backoff_ms(1)); + TEST_ASSERT_EQUAL(4000, net_guard_next_backoff_ms(2)); + TEST_ASSERT_EQUAL(8000, net_guard_next_backoff_ms(3)); + TEST_ASSERT_EQUAL(16000, net_guard_next_backoff_ms(4)); + TEST_ASSERT_EQUAL(32000, net_guard_next_backoff_ms(5)); +} + +void test_backoff_clamps_at_60s() { + TEST_ASSERT_EQUAL(60000, net_guard_next_backoff_ms(6)); + TEST_ASSERT_EQUAL(60000, net_guard_next_backoff_ms(7)); + TEST_ASSERT_EQUAL(60000, net_guard_next_backoff_ms(100)); +} + +int main() { + UNITY_BEGIN(); + RUN_TEST(test_backoff_starts_at_one_second); + RUN_TEST(test_backoff_doubles_each_attempt); + RUN_TEST(test_backoff_clamps_at_60s); + return UNITY_END(); +} +``` + +- [ ] **Step 3: Run tests to verify they fail** + +Run: `cd firmware && pio test -e native -f test_net_guard` +Expected: FAIL — linker error, `net_guard_next_backoff_ms` undefined. + +- [ ] **Step 4: Implement net_guard.cpp** + +Create `firmware/src/net_guard.cpp`: + +```cpp +// firmware/src/net_guard.cpp +#include "net_guard.h" + +uint32_t net_guard_next_backoff_ms(uint32_t attempt) { + if (attempt >= 6) return 60000; + return 1000u * (1u << attempt); +} + +#ifdef ARDUINO +#include +#include "event_log.h" + +static const DeviceConfig* s_cfg = nullptr; +static volatile uint8_t s_last_disconnect = 0; +static volatile bool s_up = false; +static volatile uint32_t s_attempts = 0; +static volatile uint32_t s_next_retry_ms = 0; + +static void on_wifi_event(WiFiEvent_t event, WiFiEventInfo_t info) { + switch (event) { + case ARDUINO_EVENT_WIFI_STA_GOT_IP: + s_up = true; + s_attempts = 0; + s_next_retry_ms = 0; + event_log_write(EVT_WIFI_UP, (uint16_t)(int16_t)WiFi.RSSI(), 0); + break; + case ARDUINO_EVENT_WIFI_STA_DISCONNECTED: + s_up = false; + s_last_disconnect = (uint8_t)info.wifi_sta_disconnected.reason; + event_log_write(EVT_WIFI_DOWN, s_last_disconnect, 0); + // schedule next retry (backoff based on consecutive attempts) + s_next_retry_ms = millis() + net_guard_next_backoff_ms(s_attempts); + break; + default: break; + } +} + +void net_guard_start(const DeviceConfig& cfg) { + s_cfg = &cfg; + WiFi.onEvent(on_wifi_event); + WiFi.setAutoReconnect(false); // we drive reconnect ourselves +} + +bool net_guard_is_up() { return s_up; } + +uint8_t net_guard_last_disconnect_reason() { return s_last_disconnect; } + +// Called from loop(); non-blocking. +// (Intentionally not in header — only main.cpp calls it.) +extern "C" void net_guard_tick() { + if (s_up || s_cfg == nullptr) return; + if (millis() < s_next_retry_ms) return; + s_attempts++; + WiFi.disconnect(false, false); + WiFi.begin(s_cfg->wifi_ssid.c_str(), s_cfg->wifi_pass.c_str()); + // if STA_DISCONNECTED fires again, on_wifi_event will reschedule + s_next_retry_ms = millis() + net_guard_next_backoff_ms(s_attempts); +} +#endif +``` + +Also add the tick prototype to `net_guard.h` inside the `#ifdef ARDUINO` block, right after `net_guard_last_disconnect_reason`: + +```cpp +extern "C" void net_guard_tick(); +``` + +- [ ] **Step 5: Run tests to verify they pass** + +Run: `cd firmware && pio test -e native -f test_net_guard` +Expected: PASS — 3/3 backoff tests. + +- [ ] **Step 6: Commit** + +```bash +git add firmware/src/net_guard.h firmware/src/net_guard.cpp firmware/test/test_native/test_net_guard.cpp +git commit -m "feat(firmware): event-driven WiFi reconnect with exponential backoff + +net_guard registers WiFi.onEvent() so disconnects are handled +immediately instead of polled every 1s. Backoff 1s→2s→4s→...→60s cap. +Every up/down transition is logged to the event log with the disconnect +reason code, so field failures are diagnosable." +``` + +--- + +## Task 4: Replace polling reconnect in main.cpp with net_guard + +**Files:** +- Modify: `firmware/src/main.cpp:149-167` (setup), `firmware/src/main.cpp:196-210` (loop) + +- [ ] **Step 1: Wire net_guard_start after WiFi connects** + +In `firmware/src/main.cpp`, add `#include "net_guard.h"` with the other includes. + +Replace lines 154-167 with: + +```cpp + WiFi.begin(g_cfg.wifi_ssid.c_str(), g_cfg.wifi_pass.c_str()); + uint32_t wifi_start = millis(); + while (WiFi.status() != WL_CONNECTED && millis() - wifi_start < 15000) { + check_factory_reset(); + delay(200); + } + + if (WiFi.status() != WL_CONNECTED) { + // Saved creds failed — re-provision + provisioning_run(); + event_log_write(EVT_REBOOT, REBOOT_WIFI_REPROV, 0); + ESP.restart(); + } + + net_guard_start(g_cfg); + led_set(false); // off = connected +``` + +- [ ] **Step 2: Replace polling reconnect in loop()** + +Replace the body of `loop()` (lines 196-210) with: + +```cpp +void loop() { + ArduinoOTA.handle(); + check_factory_reset(); + net_guard_tick(); + + static bool s_was_up = true; + bool up = net_guard_is_up(); + if (up != s_was_up) { + led_set(!up); // LED on when NOT up + if (up) reporter_flush(g_cfg); + s_was_up = up; + } + delay(200); +} +``` + +Rationale: removes the 5s blocking `delay()` that was hiding disconnects, removes `WiFi.reconnect()` (net_guard owns reconnection), and flushes buffered records on every UP transition instead of only when the polling loop happens to notice. + +- [ ] **Step 3: Build and upload** + +Run: `cd firmware && pio run -e timercam -t upload && pio device monitor` +Expected: device connects to WiFi, serial shows `[CV]` traffic. Manually kill the AP: serial shows no more traffic trying to post. Restore AP: within ~1-2s, `GOT_IP` event fires and device resumes posting. No long `delay(5000)` stalls. + +- [ ] **Step 4: Commit** + +```bash +git add firmware/src/main.cpp +git commit -m "refactor(firmware): drive WiFi reconnect from net_guard events + +loop() no longer blocks for 5s after a disconnect; reconnect is +scheduled from the WiFi event handler with exponential backoff. +Buffered reports flush on every clean UP transition." +``` + +--- + +## Task 5: HTTP timeouts + retry with backoff in reporter + +**Files:** +- Modify: `firmware/src/reporter.cpp:24-43` (post_json), `firmware/src/reporter.cpp:150-160` (reporter_heartbeat) +- Modify: `firmware/src/reporter.h:20` + +- [ ] **Step 1: Add timeouts and per-attempt event logging to `post_json`** + +In `firmware/src/reporter.cpp`, add `#include "event_log.h"` near the top. + +Replace lines 24-43 (`post_json`) with: + +```cpp +static bool post_json_once(const DeviceConfig& cfg, const char* path, const String& body) { + uint32_t ts = now_ts(); + if (ts < 1700000000UL) return false; + String sig = hmac_sign(cfg.hmac_secret, "POST", path, ts, body); + if (sig.isEmpty()) return false; + + HTTPClient http; + String url = String(REPORTER_API_HOST) + path; + http.begin(url); + http.setConnectTimeout(5000); // DNS + TCP connect + http.setTimeout(10000); // per-transaction response timeout + http.addHeader("Content-Type", "application/json"); + http.addHeader("X-Device-Id", cfg.device_id); + http.addHeader("X-Timestamp", String(ts)); + http.addHeader("X-Signature", sig); + + uint32_t t0 = millis(); + int code = http.POST(body); + uint32_t elapsed = millis() - t0; + http.end(); + uint16_t phash = event_log_path_hash(path); + Serial.printf("[HTTP] POST %s → %d (%u ms)\n", url.c_str(), code, (unsigned)elapsed); + if (code == 200) { + event_log_write(EVT_HTTP_OK, phash, (uint16_t)((elapsed > 65535) ? 65535 : elapsed)); + return true; + } + // code is negative on ESP errno (e.g. -1 connection refused); cast to uint16 preserves low bits + event_log_write(EVT_HTTP_FAIL, phash, (uint16_t)code); + return false; +} + +static bool post_json(const DeviceConfig& cfg, const char* path, const String& body) { + // 3 attempts with 0/2s/5s delays. Total worst case = 2 timeouts (≤15s each) + // + 7s sleep ≈ 37s; acceptable for an hourly task. + static const uint16_t DELAYS_MS[] = { 0, 2000, 5000 }; + for (int i = 0; i < 3; i++) { + if (DELAYS_MS[i]) vTaskDelay(pdMS_TO_TICKS(DELAYS_MS[i])); + if (post_json_once(cfg, path, body)) return true; + } + return false; +} +``` + +- [ ] **Step 2: Make `reporter_heartbeat` return status** + +In `firmware/src/reporter.h`, change line 20 from: +```cpp +void reporter_heartbeat(const DeviceConfig& cfg, uint32_t uptime_s, int wifi_rssi); +``` +to: +```cpp +bool reporter_heartbeat(const DeviceConfig& cfg, uint32_t uptime_s, int wifi_rssi); +``` + +In `firmware/src/reporter.cpp`, change the signature at line 150 and the final line from: +```cpp +void reporter_heartbeat(const DeviceConfig& cfg, uint32_t uptime_s, int wifi_rssi) { + ... + post_json(cfg, "/api/v1/heartbeat", body); +} +``` +to: +```cpp +bool reporter_heartbeat(const DeviceConfig& cfg, uint32_t uptime_s, int wifi_rssi) { + ... + return post_json(cfg, "/api/v1/heartbeat", body); +} +``` + +- [ ] **Step 3: Build + upload + smoke test** + +Run: `cd firmware && pio run -e timercam -t upload && pio device monitor` +Expected: normal heartbeats show `[HTTP] POST ... → 200 (xxx ms)` with an elapsed time ≤10s. Simulate a timeout by blocking `logs.research.bike` at the router: serial shows 3 attempts with backoff between them, then the next hourly cycle retries fresh. + +- [ ] **Step 4: Commit** + +```bash +git add firmware/src/reporter.h firmware/src/reporter.cpp +git commit -m "fix(firmware): add HTTP timeouts + 3-try retry, report heartbeat status + +Unbounded TLS/HTTP POSTs were blocking the reporter task indefinitely +on weak WiFi. Now: 5s connect timeout, 10s response timeout, 3 attempts +with 0/2s/5s backoff. Every attempt logs HTTP_OK or HTTP_FAIL to the +event log. reporter_heartbeat now returns bool so the caller can count +consecutive misses." +``` + +--- + +## Task 6: Enable ESP-IDF Task Watchdog (TWDT) on camera and reporter tasks + +**Files:** +- Modify: `firmware/src/main.cpp` (setup tail + both task bodies + loop) + +- [ ] **Step 1: Enable TWDT in setup** + +Add `#include ` with the other includes in `firmware/src/main.cpp`. + +Immediately before the `xTaskCreatePinnedToCore` calls at line 192, add: + +```cpp + // Task watchdog: 30s timeout, panic on trigger so we reboot and log cause. + esp_task_wdt_init(30, /*panic=*/true); + esp_task_wdt_add(nullptr); // subscribe the Arduino loopTask too +``` + +- [ ] **Step 2: Subscribe and feed the camera task** + +Modify `task_camera` (around line 56). After the function opens and before the `while (true)` loop, add: + +```cpp + esp_task_wdt_add(nullptr); +``` + +Inside the `while (true)` loop, immediately after `vTaskDelay(pdMS_TO_TICKS(CAM_INTERVAL_MS));` (line 84), add: + +```cpp + esp_task_wdt_reset(); +``` + +- [ ] **Step 3: Subscribe and feed the reporter task** + +Modify `task_reporter` (around line 89). After the function opens and before `while (true)`, add: + +```cpp + esp_task_wdt_add(nullptr); +``` + +Inside the loop, immediately after `vTaskDelay(pdMS_TO_TICKS(10000));` (line 93), add: + +```cpp + esp_task_wdt_reset(); +``` + +(Note: the 10s tick is < 30s TWDT timeout, so feeding here is sufficient when no POST is in flight. During a POST the 3-retry sequence is capped at ~37s, longer than TWDT. Task 7 addresses that gap explicitly by feeding mid-POST.) + +- [ ] **Step 4: Feed TWDT from loop()** + +At the top of `loop()`, add: + +```cpp + esp_task_wdt_reset(); +``` + +- [ ] **Step 5: Feed TWDT between retry attempts in reporter** + +In `firmware/src/reporter.cpp`, add `#include ` near the top. + +Inside `post_json`, before the `vTaskDelay` in the retry loop, add: + +```cpp + esp_task_wdt_reset(); +``` + +And also call `esp_task_wdt_reset()` immediately after each `http.end()` in `post_json_once`. + +- [ ] **Step 6: Build + upload + verify** + +Run: `cd firmware && pio run -e timercam -t upload && pio device monitor` +Expected: device runs normally. Force a hang by sticking a `delay(60000)` temporarily inside `task_camera` — serial should show TWDT timeout panic + reboot, and on next boot `EVT_BOOT` logs `ESP_RST_TASK_WDT` as reset reason. Remove the test delay before committing. + +- [ ] **Step 7: Commit** + +```bash +git add firmware/src/main.cpp firmware/src/reporter.cpp +git commit -m "feat(firmware): enable task watchdog on camera/reporter/loop tasks + +30s TWDT subscribes all three long-running tasks and panics on hang. +Reporter task explicitly feeds between POST retries so the 3-try retry +sequence does not itself trip the dog. Reset reason on next boot is +visible via esp_reset_reason() logged to event log." +``` + +--- + +## Task 7: Software heartbeat-miss watchdog + +**Files:** +- Modify: `firmware/src/main.cpp` (task_reporter) + +- [ ] **Step 1: Track consecutive heartbeat failures and reboot at 6** + +In `firmware/src/main.cpp`, inside `task_reporter`, replace the block that currently reads (around lines 128-133): + +```cpp + reporter_submit_camera(g_cfg, cam_rec); + reporter_submit_ble(g_cfg, ble_rec); + reporter_heartbeat(g_cfg, millis() / 1000, WiFi.RSSI()); + + ble_scanner_reinit(); + led_set(false); +``` + +with: + +```cpp + reporter_submit_camera(g_cfg, cam_rec); + reporter_submit_ble(g_cfg, ble_rec); + bool hb_ok = reporter_heartbeat(g_cfg, millis() / 1000, WiFi.RSSI()); + + ble_scanner_reinit(); + led_set(false); + + static uint8_t consecutive_misses = 0; + if (hb_ok) { + consecutive_misses = 0; + } else { + consecutive_misses++; + event_log_write(EVT_HEARTBEAT_MISS, consecutive_misses, 0); + Serial.printf("[WDG] heartbeat miss %u/6\n", consecutive_misses); + // 6 misses × 1h = 6h offline → reboot and try from scratch. + if (consecutive_misses >= 6) { + event_log_write(EVT_REBOOT, REBOOT_HEARTBEAT_MISS, 0); + delay(200); // give NVS flush time + ESP.restart(); + } + } +``` + +Rationale: 6 hours offline is the point where a clean reboot is cheaper than continuing to accumulate miss state. Lower values risk reboot loops during transient ISP outages. + +- [ ] **Step 2: Build + upload + verify** + +Run: `cd firmware && pio run -e timercam -t upload && pio device monitor` +Expected: normal operation, heartbeats succeed, no misses logged. Temporarily block API host to force 6 consecutive misses over shortened intervals (you can shrink `REPORT_INTERVAL_S` to 60 for this test only), and confirm device reboots after the 6th miss with `REBOOT_HEARTBEAT_MISS` in the event log. Restore `REPORT_INTERVAL_S = 3600` before committing. + +- [ ] **Step 3: Commit** + +```bash +git add firmware/src/main.cpp +git commit -m "feat(firmware): software heartbeat-miss watchdog reboots after 6h offline + +Reporter task counts consecutive heartbeat failures. After 6 misses +(~6 hours) the device logs the reason and reboots, giving the whole +network stack a clean reinitialization. Reset via ESP.restart() is +logged before the reboot so the next boot's heartbeat can report it." +``` + +--- + +## Task 8: Extend heartbeat payload with diagnostic fields + +**Files:** +- Modify: `firmware/src/reporter.cpp:150-160` (reporter_heartbeat) + +- [ ] **Step 1: Include heap, reset reason, last disconnect, and recent events** + +In `firmware/src/reporter.cpp`, add these includes near the top: + +```cpp +#include +#include +#include "event_log.h" +#include "net_guard.h" +``` + +Replace `reporter_heartbeat` (now returning `bool` from Task 5) with: + +```cpp +bool reporter_heartbeat(const DeviceConfig& cfg, uint32_t uptime_s, int wifi_rssi) { + JsonDocument doc; + doc["device_id"] = cfg.device_id; + doc["firmware_version"] = "1.1.0"; + doc["free_storage_pct"] = 100; + doc["wifi_rssi"] = wifi_rssi; + doc["pending_records"] = (int)(s_cam_buf.size() + s_ble_buf.size()); + doc["uptime_seconds"] = uptime_s; + + // Diagnostics + doc["reset_reason"] = (int)esp_reset_reason(); + doc["heap_free"] = (int)esp_get_free_heap_size(); + doc["heap_min_free"] = (int)esp_get_minimum_free_heap_size(); + doc["last_disconnect_code"] = (int)net_guard_last_disconnect_reason(); + + // Last 8 event-log entries, newest first + EventLogEntry recent[8]; + size_t n = event_log_read_recent(recent, 8); + JsonArray evs = doc["recent_events"].to(); + for (size_t i = 0; i < n; i++) { + JsonObject e = evs.add(); + e["t"] = recent[i].tag; + e["d0"] = recent[i].data0; + e["d1"] = recent[i].data1; + e["ts"] = recent[i].ts_unix; + e["up"] = recent[i].uptime_s; + } + + String body; serializeJson(doc, body); + return post_json(cfg, "/api/v1/heartbeat", body); +} +``` + +- [ ] **Step 2: Bump firmware version comment** + +Also update `doc["firmware_version"] = "1.0.0"` comment/version string to `"1.1.0"` (done above). + +- [ ] **Step 3: Build + upload + verify on server side** + +Run: `cd firmware && pio run -e timercam -t upload && pio device monitor` +Expected: next heartbeat payload (inspect on the server side or via a local `curl` replay) includes `heap_free`, `heap_min_free`, `reset_reason`, `last_disconnect_code`, `recent_events[]`. Power-cycle the device and confirm the first heartbeat after reboot has `reset_reason != 1` (i.e. not a power-on reset) only when the reboot was triggered deliberately. + +- [ ] **Step 4: Commit** + +```bash +git add firmware/src/reporter.cpp +git commit -m "feat(firmware): include diagnostics in heartbeat payload + +Heartbeat now carries heap stats, esp_reset_reason(), last WiFi +disconnect code, and the last 8 persisted event-log entries. Makes +field failures diagnosable server-side without retrieving the device." +``` + +--- + +## Task 9: Update server to accept + log new heartbeat fields (non-breaking) + +**Files:** +- Modify: `server/` heartbeat endpoint handler (path depends on server framework — run `grep -r "/api/v1/heartbeat" server/` to locate) + +- [ ] **Step 1: Locate the heartbeat handler** + +Run: `grep -rn "/api/v1/heartbeat" /home/peter/Projects/DoorCounter/server/` +Expected: one handler file. Open it. + +- [ ] **Step 2: Accept the new optional fields without rejecting old payloads** + +Add the new fields to the request schema as optional (nullable). In most frameworks this is a 2-4 line diff — do not fail validation if they are absent. + +Persist them to whatever storage the handler already uses (SQLite/postgres/logfile). A reasonable column set: +- `reset_reason INT NULL` +- `heap_free INT NULL` +- `heap_min_free INT NULL` +- `last_disconnect_code INT NULL` +- `recent_events JSON NULL` (or TEXT if JSON column type unavailable) + +If a migration is needed, add it. If the server stores heartbeats as free-form JSON, no migration is required. + +- [ ] **Step 3: Smoke-test with curl** + +Run the exact request shape the firmware sends. Example payload: +```json +{ + "device_id": "dc-0002", + "firmware_version": "1.1.0", + "wifi_rssi": -62, + "uptime_seconds": 123, + "reset_reason": 1, + "heap_free": 180000, + "heap_min_free": 150000, + "last_disconnect_code": 0, + "recent_events": [{"t":1,"d0":1,"d1":0,"ts":0,"up":0}] +} +``` + +Expected: 200 OK with correct HMAC. Query storage to confirm the new fields landed. + +- [ ] **Step 4: Commit** + +```bash +git add server/ +git commit -m "feat(server): persist firmware diagnostic fields from heartbeat + +Accepts reset_reason, heap_free, heap_min_free, last_disconnect_code, +and recent_events as optional fields. Older firmwares without these +fields continue to work unchanged." +``` + +--- + +## Task 10: Field deployment checklist + +**Files:** +- Modify: `README.md` (append deployment section) + +- [ ] **Step 1: Document the deployment procedure** + +Append to `README.md` under a new `## Deploying firmware 1.1 (network resilience)` section: + +- Flash command: `cd firmware && pio run -e timercam -t upload` +- Expected first boot log line: `EVT_BOOT reason=1 (power-on)` or `reason=16 (brownout)` if power supply is suspect. +- First heartbeat expected ~70s after power-on (15s WiFi + NTP + 60s BOOT_REPORT_DELAY_S). +- What to check on the server after 24 hours in the field: + - Heartbeat count ≥ 22 (≥ 92% uptime) + - No `recent_events` entry with `t=6` (EVT_HEARTBEAT_MISS) beyond transient singletons + - `heap_min_free` not drifting downward day over day (indicates leak) + - `last_disconnect_code` matches known AP behavior (8 = assoc lost, 15 = 4-way handshake timeout are common; investigate if repeating) + +- [ ] **Step 2: Commit** + +```bash +git add README.md +git commit -m "docs: network-resilience firmware deployment + field diagnostic guide" +``` + +--- + +## Verification + +After all tasks land: + +- [ ] Native tests pass: `cd firmware && pio test -e native` → all green +- [ ] Device flashed: `pio run -e timercam -t upload` +- [ ] Boot produces `EVT_BOOT` entry visible in next heartbeat payload +- [ ] Manually killing AP on router triggers `EVT_WIFI_DOWN` with reason code within ~1s (not 5s) +- [ ] Restoring AP triggers `EVT_WIFI_UP` and buffered records flush automatically +- [ ] Blocking API host forces `EVT_HTTP_FAIL` entries; after 6 hours device auto-reboots with `REBOOT_HEARTBEAT_MISS` +- [ ] No change to CV pipeline output (`[CV] entry +1` / `[CV] exit +1` still present at same cadence) diff --git a/docs/superpowers/plans/2026-05-01-ble-nimble-crash.md b/docs/superpowers/plans/2026-05-01-ble-nimble-crash.md new file mode 100644 index 0000000..baacfbe --- /dev/null +++ b/docs/superpowers/plans/2026-05-01-ble-nimble-crash.md @@ -0,0 +1,189 @@ +# BLE / NimBLE Timer-Callout Crash — Handoff + +**Date opened:** 2026-05-01 +**Status:** Resolved 2026-05-01 by upgrading `h2zero/NimBLE-Arduino` from `^1.4.2` to `^2.0.0` (`firmware/platformio.ini:24`). BLE scanning re-enabled via `BLE_SCANNING_ENABLED 1` (`firmware/src/main.cpp:30`). Verified clean on customer network for 1+ hour with no panics. +**Goal:** Re-enable BLE scanning without the device crashing within ~1s of boot. + +**Confirmed root cause:** Instrumented `os_callout_timer_cb` with `ets_printf` and observed the very first callout fire on the direct-call path with both `evq=NULL` and `fn=NULL`, while the same `co` address later (post-init) showed valid `evq` and `fn`. Same callout struct reused — classic NimBLE 1.x callout init/fire race where the FreeRTOS `TimerHandle_t` had a queued expiry against a not-yet-initialized event. NimBLE 2.x rewrote the porting layer; the race is gone. + +**Migration touches (NimBLE 1.x → 2.x):** +- `NimBLEAdvertisedDeviceCallbacks` → `NimBLEScanCallbacks` +- `onResult(NimBLEAdvertisedDevice*)` → `onResult(const NimBLEAdvertisedDevice*)` +- `setAdvertisedDeviceCallbacks(cb, true)` → `setScanCallbacks(cb, true)` +- `start(0, nullptr, false)` → `start(0, false, false)` (signature: `duration, isContinue, restart`) + +BLE was working before today's customer-site provisioning trip. The crash is reliably reproducible on the current build at the customer location whenever `BLE_SCANNING_ENABLED` is set back to `1`. It may or may not reproduce on a quieter network — the camera task's CPU-starvation pattern is shared, but the crash window's exact trigger is unconfirmed. + +--- + +## Symptom + +Within ~1s of boot, after several `cam_hal: EV-VSYNC-OVF` lines from the camera driver: + +``` +Guru Meditation Error: Core 0 panic'ed (InstrFetchProhibited). Exception was unhandled. + +Core 0 register dump: +PC : 0x00000000 PS : 0x00060630 A0 : 0x8009a9af A1 : 0x3ffbd6e0 +A2 : 0x3fff1ef8 A3 : 0x00000001 ... +A8 : 0x800f2ebc ... +EXCCAUSE: 0x00000014 EXCVADDR: 0x00000000 + +Backtrace: 0xfffffffd:0x3ffbd6e0 0x4009a9ac:0x3ffbd700 +``` + +Decoded with `~/.platformio/packages/toolchain-xtensa-esp32/bin/xtensa-esp32-elf-addr2line -e .pio/build/timercam/firmware.elf -pfiC 0x4009a9ac 0x400f2ebc`: + +``` +prvProcessReceivedCommands at freertos/timers.c:852 + (inlined by) prvTimerTask at freertos/timers.c:600 +os_callout_timer_cb at NimBLE-Arduino/.../npl_os_freertos.c:1742 +``` + +`PC=0` + `EXCCAUSE=0x14` (InstrFetchProhibited) = jump-to-NULL. The FreeRTOS timer-service task is dispatching a NimBLE callout whose callback function pointer is NULL. + +The relevant NimBLE source: + +```c +// firmware/.pio/libdeps/timercam/NimBLE-Arduino/src/nimble/porting/npl/freertos/src/npl_os_freertos.c:1729-1742 +static void +os_callout_timer_cb(TimerHandle_t timer) +{ + struct ble_npl_callout *co; + + co = pvTimerGetTimerID(timer); + assert(co); + + if (co->evq) { + ble_npl_eventq_put(co->evq, &co->ev); + } else { + co->ev.fn(&co->ev); // <-- co->ev.fn is NULL + } +} +``` + +Either `co->ev.fn` is genuinely NULL on the direct-call path, OR — given the addr2line frame is a few lines off and the callsite is ambiguous — the FreeRTOS timer's own callback pointer (`pxTimer->pxCallbackFunction`) is NULL inside `prvProcessReceivedCommands`. Both indicate a callout/timer being freed or zeroed while the FreeRTOS timer service still has a command queued for it. + +--- + +## Environment + +- Board: M5Stack TimerCam-F (ESP32-D0WDQ6-V3, dual-core 240 MHz, 4MB flash). +- BLE library: `h2zero/NimBLE-Arduino@^1.4.2` (`firmware/platformio.ini`). 1.4.2 is end-of-life on the 1.x branch; 2.x exists with breaking API changes. +- Camera: OV3660 via `esp32-camera` driver, 96×96 grayscale @ 5 FPS. +- BLE scan: passive, low-overhead, hash-collected by `firmware/src/ble_scanner.cpp`. +- Tasks: `task_camera` (core 1, prio 2, 8KB stack), `task_reporter` (core 0, prio 1, 8KB stack), Arduino loop (default). +- The camera task triggers `cam_hal: EV-VSYNC-OVF` whenever frame capture overlaps another long operation — this consistently precedes the crash in logs. + +--- + +## What's been ruled out + +1. **DNS / network code** — entirely unrelated. DNS path works in production via the new fallback-IP machinery (`firmware/src/reporter.cpp` `resolve_api_ip` and `firmware/src/reporter.h` `REPORTER_API_FALLBACK_IP`). Do not regress this; it shipped with reports working at the customer site. +2. **Our BLE app code** — the backtrace stays inside the FreeRTOS timer service and NimBLE's own porting layer; nothing in `ble_scanner.cpp` is on the call stack. The bug is in vendored NimBLE. +3. **Memory corruption from our side** — `A2 = 0x3fff1ef8` is a normal heap address, no obvious overrun pattern. Heap is healthy at the time (we'd see a different fault otherwise). +4. **Stack overflow** — A1 = 0x3ffbd6e0 is well within the FreeRTOS timer-service task's stack range; no canary smash log. + +--- + +## What changed today + +| File | Change | Keep? | +|---|---|---| +| `firmware/src/main.cpp` | Added `BLE_SCANNING_ENABLED 0` gate; all `ble_scanner_*` callsites compile out; `BLEHourlyRecord` zero-stubbed when off | Keep until crash fixed; flip to `1` to reproduce | +| `firmware/src/main.cpp` | Removed verbose `[F]`/`[CV] spawn` per-frame logging; kept entry/exit + heartbeat | Keep | +| `firmware/src/ble_scanner.cpp` | Removed `[BLE] new device:` per-discovery log | Keep | +| `firmware/src/reporter.{h,cpp}` | DNS resolution with fallback IP, raw `WiFiClient` HTTP, manual `Host:` header | Keep — production fix | +| `firmware/lib/net_guard/net_guard.{h,cpp}` | DNS pin to 1.1.1.1/8.8.8.8 at lwIP + esp-netif layers; `net_guard_dump_dns` diagnostic | Keep | + +--- + +## Reproduction + +1. `cd firmware && pio run -e timercam`. +2. Edit `firmware/src/main.cpp`, set `#define BLE_SCANNING_ENABLED 1`. Rebuild. +3. Flash a TimerCam: `python tools/flash_device.py --port /dev/ttyUSB0 --device-id dc-XXXX --location-id --hmac-secret --wifi-ssid "" --wifi-password ""`. +4. `pio device monitor --port /dev/ttyUSB0 --baud 115200`. +5. Wait ≤30s. Expect the `Guru Meditation Error: Core 0 panic'ed (InstrFetchProhibited)` traceback above. + +Crash is **deterministic** on the customer's network (Elly-Fi). Worth retesting on a quiet desk network — if it doesn't repro there, the trigger is camera-task starvation interacting with NimBLE timers, not a pure NimBLE bug. + +To decode any future crash backtrace: + +```sh +~/.platformio/packages/toolchain-xtensa-esp32/bin/xtensa-esp32-elf-addr2line \ + -e firmware/.pio/build/timercam/firmware.elf -pfiC ... +``` + +--- + +## Investigation paths, in order of effort/confidence + +### 1. Confirm the failing call site (cheap, do this first) + +The addr2line line numbers can be off by ±3 due to inlining. Add a temporary `Serial.printf` patch to `npl_os_freertos.c` `os_callout_timer_cb` to log `co`, `co->evq`, `co->ev.fn` on entry. Reproduce. Then we know with certainty whether `co->ev.fn` is NULL on the direct-call path or whether this is an FreeRTOS-level issue (queued command for a deleted timer). + +If `evq != NULL` and we still crash, the NULL is in the queued event dispatcher (a different code path; pivot the investigation). + +### 2. Try upgrading NimBLE-Arduino to 2.x (medium effort, likely-fix) + +`platformio.ini` has `h2zero/NimBLE-Arduino@^1.4.2`. 2.x rewrote the porting layer significantly. Breaking API changes — `NimBLEAdvertisedDeviceCallbacks` was renamed/restructured. Touch points: `firmware/src/ble_scanner.cpp` (the only file that uses NimBLE). + +Try: pin `^2.0.0`, fix the API breakage in `ble_scanner.cpp` (it's <100 lines). If 2.x crashes too, the issue is independent of NimBLE version → pivot to (3) or (4). + +### 3. Reduce camera-task starvation (cheap, may be sufficient) + +The `EV-VSYNC-OVF` lines are the canary. The camera task pins core 1 at priority 2 doing CV processing every 200ms. NimBLE host task runs on core 0 by default but the FreeRTOS timer service task is core-agnostic and may be starved during long CV passes that hold a mutex. + +Things to try in `firmware/src/main.cpp`: +- Lower `CAM_FPS` from 5 to 3, see if VSYNC-OVF still appears. +- Move CV processing off the capture path (capture into a queue, process at lower priority). +- Raise FreeRTOS timer-service task priority via `configTIMER_TASK_PRIORITY` (sdkconfig). +- Confirm NimBLE host task pinning — `CONFIG_BT_NIMBLE_PINNED_TO_CORE` should be 0 or 1 (not unpinned). + +### 4. Local NULL-guard patch (last resort, masks the bug) + +If upgrade is blocked and starvation reduction isn't enough, patch the vendored source: + +```c +// npl_os_freertos.c:1740 +} else { + if (co->ev.fn) co->ev.fn(&co->ev); +} +``` + +This silences the crash but drops the dropped event. The dropped events are likely scan-result deliveries; we'd undercount BLE devices but not crash. Acceptable as a stopgap with a `// TODO: remove when NimBLE upgraded` and a note in this doc. + +Caveat: vendored library files in `.pio/libdeps/` get blown away by clean builds. Either copy NimBLE into `firmware/lib/` and pin it (vendored), or use `lib_archive` + a post-install script. Don't ship a build that depends on an unpinned hand-edit. + +### 5. Replace BLE stack (high effort) + +If 2.x also crashes and starvation reduction doesn't help, switch to the IDF-native bluedroid stack via the Arduino-ESP32 `BLEDevice` API. Larger memory footprint (~30KB more heap) but a different lifecycle model — won't share NimBLE's bug. + +--- + +## Constraints / things not to break + +- `firmware/src/reporter.cpp` DNS path with `REPORTER_API_FALLBACK_IP` — production fix, must keep working. Do not regress to `HTTPClient`. +- `BLE_SCANNING_ENABLED 0` is the **shipping default** until this is resolved. Devices in the field rely on this; flip to `1` only in dev builds. +- `firmware/lib/net_guard/net_guard.cpp` `net_guard_pin_dns()` is called both at boot and on every WiFi reconnect; if reorganizing net_guard, preserve both call sites. +- The `ble_scanner` module supports `ble_scanner_pause`/`resume` for OTA — verify it still works after any NimBLE upgrade (`ArduinoOTA.onStart` hook in `main.cpp:248`). + +--- + +## Open questions + +- Does the crash repro on a quiet network with no `EV-VSYNC-OVF`? (Determines whether starvation is necessary vs sufficient.) +- Was BLE working in a previous build, and on which NimBLE version? Earliest BLE-related commit traced to is well before today; binary search across firmware commits with BLE enabled would identify the regression boundary if it's our code. +- Does the customer site have an unusual RF environment (very dense BLE) that increases the callout-churn rate, making the race more likely? Worth a `nimble_scan_event` count log during a 60s capture window. + +--- + +## Quick verification once you think it's fixed + +1. Set `BLE_SCANNING_ENABLED 1`, rebuild, flash. +2. Run for at least 10 minutes on the customer network — the original crash hit within ~1s, so 10 min with no panic is strong evidence. +3. Confirm a successful hourly cycle: `[CV] entry/exit`, then `[HTTP] POST .../events/batch ... -> 200`, BLE record with non-zero `unique_devices`. +4. Run a second device side-by-side; confirm no cross-device interference. + +When done, set `BLE_SCANNING_ENABLED 1` as the default and remove the gate (keep the comment block as institutional memory of the bug). diff --git a/firmware/lib/net_guard/net_guard.cpp b/firmware/lib/net_guard/net_guard.cpp index 36b4f3e..a1d5ca1 100644 --- a/firmware/lib/net_guard/net_guard.cpp +++ b/firmware/lib/net_guard/net_guard.cpp @@ -9,8 +9,66 @@ uint32_t net_guard_next_backoff_ms(uint32_t attempt) { #ifdef ARDUINO #include "config.h" #include +#include +#include +#include #include "event_log.h" +// Both lwIP's ip_addr_t and esp-netif's esp_ip_addr_t alias the same on-disk +// layout for IPv4, but the C++ types differ. Take the raw u32 to sidestep it. +static String fmt_v4(uint32_t addr_be) { + if (addr_be == 0) return String("0.0.0.0"); + char b[16]; + snprintf(b, sizeof(b), "%u.%u.%u.%u", + (unsigned)((addr_be >> 0) & 0xFF), + (unsigned)((addr_be >> 8) & 0xFF), + (unsigned)((addr_be >> 16) & 0xFF), + (unsigned)((addr_be >> 24) & 0xFF)); + return String(b); +} + +void net_guard_dump_dns(const char* tag) { + const ip_addr_t* d0 = dns_getserver(0); + const ip_addr_t* d1 = dns_getserver(1); + Serial.printf("[DNS] %s lwip: %s , %s\n", tag, + fmt_v4(d0 ? ip_2_ip4(d0)->addr : 0).c_str(), + fmt_v4(d1 ? ip_2_ip4(d1)->addr : 0).c_str()); + + esp_netif_t* sta = esp_netif_get_handle_from_ifkey("WIFI_STA_DEF"); + if (sta) { + esp_netif_dns_info_t main_dns{}, backup_dns{}; + esp_netif_get_dns_info(sta, ESP_NETIF_DNS_MAIN, &main_dns); + esp_netif_get_dns_info(sta, ESP_NETIF_DNS_BACKUP, &backup_dns); + Serial.printf("[DNS] %s netif: %s , %s\n", tag, + fmt_v4(main_dns.ip.u_addr.ip4.addr).c_str(), + fmt_v4(backup_dns.ip.u_addr.ip4.addr).c_str()); + } else { + Serial.printf("[DNS] %s netif: \n", tag); + } +} + +void net_guard_pin_dns() { + ip_addr_t d1, d2; + IP_ADDR4(&d1, 1, 1, 1, 1); + IP_ADDR4(&d2, 8, 8, 8, 8); + dns_setserver(0, &d1); + dns_setserver(1, &d2); + + // Also push through the esp_netif layer. dns_setserver() writes the + // global lwIP table directly; esp_netif_set_dns_info() is what the + // DHCP client itself calls, so writing here prevents the next DHCP + // event from silently overwriting our pin. + esp_netif_t* sta = esp_netif_get_handle_from_ifkey("WIFI_STA_DEF"); + if (sta) { + esp_netif_dns_info_t info{}; + IP_ADDR4(&info.ip, 1, 1, 1, 1); + esp_netif_set_dns_info(sta, ESP_NETIF_DNS_MAIN, &info); + IP_ADDR4(&info.ip, 8, 8, 8, 8); + esp_netif_set_dns_info(sta, ESP_NETIF_DNS_BACKUP, &info); + } + net_guard_dump_dns("pinned"); +} + // Shared with the WiFi event task. 32-bit aligned loads/stores are atomic on // Xtensa; volatile suffices. Tick re-evaluates every loop iteration, so stale // reads self-correct within ~200ms. @@ -23,6 +81,11 @@ static volatile uint32_t s_next_retry_ms = 0; static void on_wifi_event(WiFiEvent_t event, WiFiEventInfo_t info) { switch (event) { case ARDUINO_EVENT_WIFI_STA_GOT_IP: + // Override DHCP-supplied DNS. Some routers return TC=1 for short + // answers (forcing TCP fallback that lwIP can't follow), or hand + // out an unreachable resolver. Pin to public resolvers so + // hostByName() never depends on the local network's DNS quality. + net_guard_pin_dns(); s_up = true; s_attempts = 0; s_next_retry_ms = 0; diff --git a/firmware/lib/net_guard/net_guard.h b/firmware/lib/net_guard/net_guard.h index 9b9fc66..4ae84b1 100644 --- a/firmware/lib/net_guard/net_guard.h +++ b/firmware/lib/net_guard/net_guard.h @@ -21,4 +21,13 @@ uint8_t net_guard_last_disconnect_reason(); // Non-blocking tick called from loop(); kicks reconnect if due. extern "C" void net_guard_tick(); + +// Override DHCP-supplied DNS with public resolvers (1.1.1.1, 8.8.8.8). +// Idempotent; safe to call repeatedly. net_guard re-applies on every GOT_IP, +// but main.cpp must call it once for the boot association (which completes +// before net_guard_start() registers its event handler). +void net_guard_pin_dns(); + +// Diagnostic: print current DNS table state from both lwIP and esp_netif. +void net_guard_dump_dns(const char* tag); #endif diff --git a/firmware/platformio.ini b/firmware/platformio.ini index 1f782c8..a6bc870 100644 --- a/firmware/platformio.ini +++ b/firmware/platformio.ini @@ -21,7 +21,7 @@ upload_flags = --no-stub lib_deps = tzapu/WiFiManager@^2.0.17 bblanchon/ArduinoJson@^7.0.0 - h2zero/NimBLE-Arduino@^1.4.2 + h2zero/NimBLE-Arduino@^2.0.0 espressif/esp32-camera ; Frame-capture build. Strips WiFi/BLE/CV/reporter; streams raw 96x96 frames diff --git a/firmware/src/ble_scanner.cpp b/firmware/src/ble_scanner.cpp index a7525d4..9629679 100644 --- a/firmware/src/ble_scanner.cpp +++ b/firmware/src/ble_scanner.cpp @@ -42,8 +42,8 @@ static String sha256_prefix(const String& input) { return hex; } -class ScanCallback : public NimBLEAdvertisedDeviceCallbacks { - void onResult(NimBLEAdvertisedDevice* dev) override { +class ScanCallback : public NimBLEScanCallbacks { + void onResult(const NimBLEAdvertisedDevice* dev) override { String mac = String(dev->getAddress().toString().c_str()); String hash = sha256_prefix(mac); int rssi = dev->getRSSI(); @@ -51,7 +51,6 @@ class ScanCallback : public NimBLEAdvertisedDeviceCallbacks { std::lock_guard lock(s_mutex); auto it = s_seen.find(hash); if (it == s_seen.end()) { - Serial.printf("[BLE] new device: %s (rssi %d)\n", hash.c_str(), rssi); s_seen[hash] = {rssi, 1}; } else { it->second.rssi_sum += rssi; @@ -68,16 +67,16 @@ static NimBLEScan* s_scan = nullptr; void ble_scanner_start() { NimBLEDevice::init(""); s_scan = NimBLEDevice::getScan(); - s_scan->setAdvertisedDeviceCallbacks(&s_callback, true); // true = allow duplicates + s_scan->setScanCallbacks(&s_callback, true); // true = allow duplicates s_scan->setActiveScan(false); // passive s_scan->setInterval(100); s_scan->setWindow(99); s_scan->setMaxResults(0); // don't store results — callback-only - s_scan->start(0, nullptr, false); // 0 = continuous + s_scan->start(0, false, false); // duration=0 (forever), isContinue=false, restart=false } void ble_scanner_pause() { if (s_scan) s_scan->stop(); } -void ble_scanner_resume() { if (s_scan) s_scan->start(0, nullptr, false); } +void ble_scanner_resume() { if (s_scan) s_scan->start(0, false, false); } void ble_scanner_deinit() { if (s_scan) s_scan->stop(); diff --git a/firmware/src/main.cpp b/firmware/src/main.cpp index 765ac6c..5df1d28 100644 --- a/firmware/src/main.cpp +++ b/firmware/src/main.cpp @@ -19,6 +19,15 @@ #define BUTTON_PIN 37 #define FACTORY_RESET_HOLD_MS 5000 +// BLE scanning disabled in production until the NimBLE-Arduino 1.4.2 timer +// race is resolved. Symptom: FreeRTOS timer task dispatches an +// os_callout_timer_cb whose callback fn is NULL, causing PC=0 fetch and +// Historical note: NimBLE-Arduino 1.4.2 had an init/fire race in its FreeRTOS +// callout porting layer that caused a NULL-fn dispatch (PC=0, +// InstrFetchProhibited) within ~1s of boot when the camera task starved the +// timer service. Fixed by upgrading to 2.x (see platformio.ini). +#define BLE_SCANNING_ENABLED 1 + #define CAM_FPS 5 #define CAM_INTERVAL_MS (1000 / CAM_FPS) #define REPORT_INTERVAL_S 3600 @@ -67,16 +76,7 @@ static void task_camera(void*) { if (camera_capture_96(frame)) { if (xSemaphoreTake(s_cv_mutex, pdMS_TO_TICKS(100)) == pdTRUE) { CVResult r = cv_process(g_cv, frame, g_cfg.line_offset); - for (const auto& t : g_cv.tracks) { - if (t.id > last_logged_track_id) { - last_logged_track_id = t.id; - Serial.printf("[CV] spawn id=%d y=%.1f\n", t.id, t.spawn_y); - } - } - if (r.fg_count > 0) { - Serial.printf("[F] n=%d y=%d..%d c=%.1f\n", - r.fg_count, r.fg_min_y, r.fg_max_y, r.fg_centroid_y); - } + (void)last_logged_track_id; if (r.entries_delta) Serial.printf("[CV] entry +%d (total %d) first=%.1f min=%.1f max=%.1f last=%.1f dur=%d\n", r.entries_delta, g_cv.entries, r.fire_first_c, r.fire_min_c, r.fire_max_c, r.fire_last_c, r.fire_duration); @@ -119,7 +119,9 @@ static void task_reporter(void*) { last_report_ts = now; // Deinit BLE to free ~25KB heap for SSL handshakes +#if BLE_SCANNING_ENABLED ble_scanner_deinit(); +#endif led_set(true); // on = uploading CameraHourlyRecord cam_rec; @@ -129,18 +131,26 @@ static void task_reporter(void*) { xSemaphoreGive(s_cv_mutex); } else { // Failed to acquire — skip this cycle, will report next hour +#if BLE_SCANNING_ENABLED ble_scanner_reinit(); +#endif led_set(false); continue; } +#if !BLE_SCANNING_ENABLED + BLEHourlyRecord ble_rec = {period_start, period_end, 0, 0}; +#else BLEHourlyRecord ble_rec = ble_scanner_collect(period_start, period_end); +#endif reporter_submit_camera(g_cfg, cam_rec); reporter_submit_ble(g_cfg, ble_rec); bool hb_ok = reporter_heartbeat(g_cfg, millis() / 1000, WiFi.RSSI()); +#if BLE_SCANNING_ENABLED ble_scanner_reinit(); +#endif led_set(false); static uint8_t consecutive_misses = 0; @@ -202,6 +212,11 @@ void setup() { ESP.restart(); } + // Boot connect happens before net_guard registers its WiFi event handler, + // so the GOT_IP-driven DNS override there won't fire for this association. + // Pin DNS now; net_guard re-applies it on every subsequent reconnect. + net_guard_pin_dns(); + net_guard_start(g_cfg); led_set(false); // off = connected @@ -220,17 +235,29 @@ void setup() { reporter_init(); +#if BLE_SCANNING_ENABLED ble_scanner_start(); +#endif // OTA update support ArduinoOTA.setHostname(g_cfg.device_id.c_str()); +#if !BLE_SCANNING_ENABLED + ArduinoOTA.onStart([]() { }); +#else ArduinoOTA.onStart([]() { ble_scanner_pause(); }); +#endif ArduinoOTA.onEnd([]() { +#if BLE_SCANNING_ENABLED ble_scanner_resume(); +#endif event_log_write(EVT_REBOOT, REBOOT_OTA, 0); ESP.restart(); }); +#if !BLE_SCANNING_ENABLED + ArduinoOTA.onError([](ota_error_t e) { }); +#else ArduinoOTA.onError([](ota_error_t e) { ble_scanner_resume(); }); +#endif ArduinoOTA.begin(); s_cv_mutex = xSemaphoreCreateMutex(); diff --git a/firmware/src/reporter.cpp b/firmware/src/reporter.cpp index b3ed72b..cda1536 100644 --- a/firmware/src/reporter.cpp +++ b/firmware/src/reporter.cpp @@ -26,28 +26,107 @@ static uint32_t now_ts() { return (uint32_t)time(nullptr); } +// Last successfully resolved IP — used as a warm fallback if a subsequent +// resolution fails. Never takes precedence over a fresh successful resolve. +static IPAddress s_cached_api_ip; + +// Resolve the API host. Tries hostByName first; on failure falls back to the +// last good resolution, then to the hardcoded fallback IP. Returns the IP via +// out-param and a label describing where it came from for logging. +static bool resolve_api_ip(IPAddress& out, const char*& source) { + IPAddress ip; + uint32_t r0 = millis(); + bool ok = WiFi.hostByName(REPORTER_API_HOST_NAME, ip); + uint32_t elapsed = millis() - r0; + if (ok) { + s_cached_api_ip = ip; + out = ip; + source = "dns"; + Serial.printf("[DNS] %s -> %s (%u ms)\n", + REPORTER_API_HOST_NAME, ip.toString().c_str(), (unsigned)elapsed); + return true; + } + Serial.printf("[DNS] %s -> FAIL (%u ms)\n", + REPORTER_API_HOST_NAME, (unsigned)elapsed); + net_guard_dump_dns("on-fail"); + net_guard_pin_dns(); // re-assert in case something overwrote the table + + if ((uint32_t)s_cached_api_ip != 0) { + out = s_cached_api_ip; + source = "cache"; + return true; + } + if (out.fromString(REPORTER_API_FALLBACK_IP)) { + source = "fallback"; + return true; + } + return false; +} + +// Drains and parses the HTTP response status line. Returns the numeric status +// code, or -1 on read timeout / malformed response. +static int read_http_status(WiFiClient& client, uint32_t timeout_ms) { + uint32_t deadline = millis() + timeout_ms; + while (!client.available() && millis() < deadline) vTaskDelay(pdMS_TO_TICKS(10)); + if (!client.available()) return -1; + String line = client.readStringUntil('\n'); + line.trim(); + // Format: "HTTP/1.1 200 OK" + int sp1 = line.indexOf(' '); + if (sp1 < 0) return -1; + int sp2 = line.indexOf(' ', sp1 + 1); + String code_str = (sp2 > 0) ? line.substring(sp1 + 1, sp2) : line.substring(sp1 + 1); + return code_str.toInt(); +} + static bool post_json_once(const DeviceConfig& cfg, const char* path, const String& body) { uint32_t ts = now_ts(); if (ts < 1700000000UL) return false; String sig = hmac_sign(cfg.hmac_secret, "POST", path, ts, body); if (sig.isEmpty()) return false; - HTTPClient http; - String url = String(REPORTER_API_HOST) + path; - http.begin(url); - http.setConnectTimeout(5000); // DNS + TCP connect - http.setTimeout(10000); // per-transaction response timeout - http.addHeader("Content-Type", "application/json"); - http.addHeader("X-Device-Id", cfg.device_id); - http.addHeader("X-Timestamp", String(ts)); - http.addHeader("X-Signature", sig); + IPAddress ip; + const char* ip_source = "?"; + if (!resolve_api_ip(ip, ip_source)) { + Serial.printf("[HTTP] POST %s -> resolve-fail\n", path); + event_log_write(EVT_HTTP_FAIL, event_log_path_hash(path), (uint16_t)-1); + return false; + } uint32_t t0 = millis(); - int code = http.POST(body); + WiFiClient client; + client.setTimeout(10); // seconds — read timeout + if (!client.connect(ip, REPORTER_API_PORT, 5000 /*ms connect timeout*/)) { + uint32_t elapsed = millis() - t0; + Serial.printf("[HTTP] connect %s:%u (%s) -> failed (%u ms)\n", + ip.toString().c_str(), REPORTER_API_PORT, ip_source, (unsigned)elapsed); + event_log_write(EVT_HTTP_FAIL, event_log_path_hash(path), (uint16_t)-1); + return false; + } + + // Manual HTTP/1.1 — gives us full control over the Host header so the + // server's vhost routing works even when we connect by IP. + client.printf("POST %s HTTP/1.1\r\n", path); + client.printf("Host: %s\r\n", REPORTER_API_HOST_NAME); + client.print ("Connection: close\r\n"); + client.print ("Content-Type: application/json\r\n"); + client.printf("Content-Length: %u\r\n", (unsigned)body.length()); + client.printf("X-Device-Id: %s\r\n", cfg.device_id.c_str()); + client.printf("X-Timestamp: %u\r\n", (unsigned)ts); + client.printf("X-Signature: %s\r\n", sig.c_str()); + client.print ("\r\n"); + client.print(body); + + int code = read_http_status(client, 10000); + // Drain so the server can close cleanly. + while (client.connected() && client.available()) client.read(); + client.stop(); + uint32_t elapsed = millis() - t0; - http.end(); uint16_t phash = event_log_path_hash(path); - Serial.printf("[HTTP] POST %s -> %d (%u ms)\n", url.c_str(), code, (unsigned)elapsed); + Serial.printf("[HTTP] POST %s%s (%s %s) -> %d (%u ms)\n", + REPORTER_API_HOST_NAME, path, ip_source, ip.toString().c_str(), + code, (unsigned)elapsed); if (code == 200) { event_log_write(EVT_HTTP_OK, phash, (uint16_t)((elapsed > 65535) ? 65535 : elapsed)); return true; diff --git a/firmware/src/reporter.h b/firmware/src/reporter.h index cd3f49e..cae399c 100644 --- a/firmware/src/reporter.h +++ b/firmware/src/reporter.h @@ -11,8 +11,13 @@ struct CameraHourlyRecord { int exits; }; -static const int REPORTER_MAX_BUFFER = 24; -static const char* REPORTER_API_HOST = "http://logs.research.bike"; +static const int REPORTER_MAX_BUFFER = 24; +static const char* REPORTER_API_HOST_NAME = "logs.research.bike"; +static const uint16_t REPORTER_API_PORT = 80; +// Hardcoded fallback used when DNS fails (some customer networks intercept +// :53 with a transparent proxy that mangles responses). Update if the +// server's IP changes — but a successful hostByName() always wins over this. +static const char* REPORTER_API_FALLBACK_IP = "5.78.114.131"; void reporter_init(); void reporter_submit_camera(const DeviceConfig& cfg, const CameraHourlyRecord& rec);