fix(firmware): reboot on FATAL failures + emit NTP_SYNC + server-coord warning
- Config-load and camera-init FATAL branches now reboot (3s LED signal before restart) instead of hanging forever. Matches the enum name REBOOT_FATAL_* and makes camera-init failures diagnosable via the next boot's heartbeat recent_events. Config failures produce a visible reboot loop rather than a silent hang.
- Emit EVT_NTP_SYNC(seconds_since_boot) on the first NTP-synced reporter iteration so slow / failed NTP sync is a visible signal in the heartbeat's recent_events window.
- README "Deploying firmware 1.1" now opens with a "Before you flash" warning directing the operator to land server-side heartbeat schema changes first (migration 005 + stub integration) to avoid a strict-schema 4xx reboot loop after deployment.
This commit is contained in:
22
README.md
22
README.md
@@ -200,6 +200,28 @@ python tools/serial_monitor.py --port /dev/ttyUSB0 --reset --timestamp --seconds
|
||||
|
||||
## Deploying firmware 1.1 (network resilience)
|
||||
|
||||
### Before you flash
|
||||
|
||||
Firmware 1.1 adds five new fields to the `POST /api/v1/heartbeat` payload
|
||||
(`reset_reason`, `heap_free`, `heap_min_free`, `last_disconnect_code`,
|
||||
`recent_events`). **The real server must accept these optional fields before
|
||||
you deploy firmware 1.1**, or strict-schema validation will reject (HTTP 4xx) every
|
||||
heartbeat; after 6 consecutive misses (~6h) the heartbeat-miss watchdog
|
||||
will reboot the device, producing a reboot loop.
|
||||
|
||||
Reference migration and handler code for the real server are in this repo:
|
||||
|
||||
- `server/heartbeat_diagnostics_stub.py` — Pydantic model extensions,
|
||||
`store_heartbeat_diagnostics()` helper, and `EVENT_TAG_DECODER` /
|
||||
`REBOOT_REASON_DECODER` reference tables.
|
||||
- `server/migrations/005_heartbeat_diagnostics.sql` — adds five nullable
|
||||
columns to the `heartbeats` table (adjust the table name to match the real
|
||||
server's schema).
|
||||
|
||||
Copy the stub additions into the production server repo, run the
|
||||
migration, and confirm a v1.1.0-shape heartbeat returns 200 before you
|
||||
flash any device.
|
||||
|
||||
### Flash command
|
||||
|
||||
```bash
|
||||
|
||||
@@ -107,6 +107,7 @@ static void task_reporter(void*) {
|
||||
|
||||
// First valid timestamp — schedule boot report 60s from now
|
||||
if (last_report_ts == 0) {
|
||||
event_log_write(EVT_NTP_SYNC, (uint16_t)(millis() / 1000), 0);
|
||||
last_report_ts = now - (REPORT_INTERVAL_S - BOOT_REPORT_DELAY_S);
|
||||
continue;
|
||||
}
|
||||
@@ -170,7 +171,14 @@ void setup() {
|
||||
if (!config_load(g_cfg)) {
|
||||
Serial.println("FATAL: device_id/location_id/hmac_secret not provisioned");
|
||||
event_log_write(EVT_REBOOT, REBOOT_FATAL_CONFIG, 0);
|
||||
while (true) { delay(500); led_set(!digitalRead(LED_PIN)); } // fast blink
|
||||
// Blink fast for 3s so a physically-present operator can see it,
|
||||
// then reboot so EVT_BOOT history on the next heartbeat surfaces
|
||||
// the failure — though in this case the device can't heartbeat
|
||||
// without config, so the real signal is the fast-blink-then-reboot
|
||||
// cycle visible on the LED.
|
||||
uint32_t t0 = millis();
|
||||
while (millis() - t0 < 3000) { led_set(!digitalRead(LED_PIN)); delay(100); }
|
||||
ESP.restart();
|
||||
}
|
||||
|
||||
// Connect to WiFi
|
||||
@@ -205,7 +213,9 @@ void setup() {
|
||||
if (!camera_init()) {
|
||||
Serial.println("FATAL: camera init failed");
|
||||
event_log_write(EVT_REBOOT, REBOOT_FATAL_CAMERA, 0);
|
||||
while (true) delay(1000);
|
||||
uint32_t t0 = millis();
|
||||
while (millis() - t0 < 3000) { led_set(!digitalRead(LED_PIN)); delay(100); }
|
||||
ESP.restart();
|
||||
}
|
||||
|
||||
reporter_init();
|
||||
|
||||
Reference in New Issue
Block a user