From a795cfa0ade47d3433b772136569a760bea25b37 Mon Sep 17 00:00:00 2001 From: Peter Woolery Date: Thu, 23 Apr 2026 14:10:32 -0700 Subject: [PATCH] fix(firmware): reboot on FATAL failures + emit NTP_SYNC + server-coord warning - Config-load and camera-init FATAL branches now reboot (3s LED signal before restart) instead of hanging forever. Matches the enum name REBOOT_FATAL_* and makes camera-init failures diagnosable via the next boot's heartbeat recent_events. Config failures produce a visible reboot loop rather than a silent hang. - Emit EVT_NTP_SYNC(seconds_since_boot) on the first NTP-synced reporter iteration so slow / failed NTP sync is a visible signal in the heartbeat's recent_events window. - README "Deploying firmware 1.1" now opens with a "Before you flash" warning directing the operator to land server-side heartbeat schema changes first (migration 005 + stub integration) to avoid a strict-schema 4xx reboot loop after deployment. --- README.md | 22 ++++++++++++++++++++++ firmware/src/main.cpp | 14 ++++++++++++-- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7237d71..1dfcb0e 100644 --- a/README.md +++ b/README.md @@ -200,6 +200,28 @@ python tools/serial_monitor.py --port /dev/ttyUSB0 --reset --timestamp --seconds ## Deploying firmware 1.1 (network resilience) +### Before you flash + +Firmware 1.1 adds five new fields to the `POST /api/v1/heartbeat` payload +(`reset_reason`, `heap_free`, `heap_min_free`, `last_disconnect_code`, +`recent_events`). **The real server must accept these optional fields before +you deploy firmware 1.1**, or strict-schema validation will 4xx every +heartbeat; after 6 consecutive misses (~6h) the heartbeat-miss watchdog +will reboot the device, producing a reboot loop. + +Reference migration and handler code for the real server are in this repo: + +- `server/heartbeat_diagnostics_stub.py` — Pydantic model extensions, + `store_heartbeat_diagnostics()` helper, and `EVENT_TAG_DECODER` / + `REBOOT_REASON_DECODER` reference tables. +- `server/migrations/005_heartbeat_diagnostics.sql` — adds five nullable + columns to the `heartbeats` table (adjust table name to match the real + server's schema). + +Copy the stub additions into the production server repo, run the +migration, and confirm a v1.1.0-shape heartbeat returns 200 before you +flash any device. + ### Flash command ```bash diff --git a/firmware/src/main.cpp b/firmware/src/main.cpp index 6f5576c..765ac6c 100644 --- a/firmware/src/main.cpp +++ b/firmware/src/main.cpp @@ -107,6 +107,7 @@ static void task_reporter(void*) { // First valid timestamp — schedule boot report 60s from now if (last_report_ts == 0) { + event_log_write(EVT_NTP_SYNC, (uint16_t)(millis() / 1000), 0); last_report_ts = now - (REPORT_INTERVAL_S - BOOT_REPORT_DELAY_S); continue; } @@ -170,7 +171,14 @@ void setup() { if (!config_load(g_cfg)) { Serial.println("FATAL: device_id/location_id/hmac_secret not provisioned"); event_log_write(EVT_REBOOT, REBOOT_FATAL_CONFIG, 0); - while (true) { delay(500); led_set(!digitalRead(LED_PIN)); } // fast blink + // Blink fast for 3s so a physically-present operator can see it, + // then reboot so EVT_BOOT history on the next heartbeat surfaces + // the failure — though in this case the device can't heartbeat + // without config, so the real signal is the fast-blink-then-reboot + // cycle visible on the LED. + uint32_t t0 = millis(); + while (millis() - t0 < 3000) { led_set(!digitalRead(LED_PIN)); delay(100); } + ESP.restart(); } // Connect to WiFi @@ -205,7 +213,9 @@ void setup() { if (!camera_init()) { Serial.println("FATAL: camera init failed"); event_log_write(EVT_REBOOT, REBOOT_FATAL_CAMERA, 0); - while (true) delay(1000); + uint32_t t0 = millis(); + while (millis() - t0 < 3000) { led_set(!digitalRead(LED_PIN)); delay(100); } + ESP.restart(); } reporter_init();