Files
DoorCounter/server/heartbeat_diagnostics_stub.py
Peter Woolery d943b3df5a feat(firmware): log reason before FATAL hang loops
Two FATAL while(true) hangs in main.cpp (config load fail, camera init
fail) previously relied on the hardware watchdog to reboot the device,
leaving the cause invisible beyond a generic TWDT reset reason. Now
each path logs EVT_REBOOT with REBOOT_FATAL_CONFIG or REBOOT_FATAL_CAMERA
before hanging, so the next heartbeat's recent_events surfaces which
branch hung. Server-side decoder updated for the two new enum values.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-23 14:03:57 -07:00

128 lines
4.6 KiB
Python

# server/heartbeat_diagnostics_stub.py
# Add these models and the persistence helper to the server's main.py alongside
# the existing heartbeat endpoint (POST /api/v1/heartbeat).
# Requires: diagnostic columns on the heartbeats table (see migrations/005_heartbeat_diagnostics.sql)
#
# Firmware v1.1.0 extends the heartbeat payload with five optional diagnostic
# fields. v1.0.0-shape payloads (without these fields) must continue to parse
# cleanly — every new field is Optional and defaults to None.
#
# IMPORTANT: Adjust the table name in store_heartbeat_diagnostics to match the
# real server's schema if it differs from "heartbeats".
import json
import sqlite3
from typing import List, Optional
from pydantic import BaseModel
class RecentEvent(BaseModel):
    # One entry of a heartbeat's recent_events list. All five fields are
    # required integers; the meaning of d0/d1 depends on the tag in `t`.
    t: int   # EventLogTag (see EVENT_TAG_DECODER)
    d0: int  # tag-specific datum 0
    d1: int  # tag-specific datum 1
    ts: int  # unix timestamp (seconds)
    up: int  # seconds since boot when event was logged
# Extend the existing HeartbeatRequest model in main.py by adding these five
# optional fields. The rest of the heartbeat model (device_id, uptime, etc.)
# stays as-is. Shown here as a standalone model for reference/testing.
class HeartbeatDiagnosticsFields(BaseModel):
    # The five diagnostic fields introduced with firmware v1.1.0. Each one
    # defaults to None, so a payload that omits it still validates.
    reset_reason: Optional[int] = None
    heap_free: Optional[int] = None
    heap_min_free: Optional[int] = None
    last_disconnect_code: Optional[int] = None
    # Event-log entries carried by this heartbeat (see RecentEvent).
    recent_events: Optional[List[RecentEvent]] = None
# Example of the fully-extended heartbeat request model (merge into the
# existing HeartbeatRequest in main.py rather than introducing a second class):
class HeartbeatRequestWithDiagnostics(BaseModel):
    # Reference shape of a heartbeat request once the diagnostic fields are
    # added: two required base fields followed by five optional diagnostics.

    # Required base fields.
    device_id: str
    uptime: int
    # ... existing fields from the v1.0.0 heartbeat model go here ...

    # New v1.1.0 diagnostic fields; each defaults to None when the payload
    # does not include it.
    reset_reason: Optional[int] = None
    heap_free: Optional[int] = None
    heap_min_free: Optional[int] = None
    last_disconnect_code: Optional[int] = None
    recent_events: Optional[List[RecentEvent]] = None
# Call this inside the existing receive_heartbeat handler after the base
# heartbeat row has been inserted/updated. It persists the diagnostic fields
# on the same row keyed by device_id.
def store_heartbeat_diagnostics(
    db: sqlite3.Connection,
    device_id: str,
    # Quoted so this helper still imports when the diagnostic fields have been
    # merged into main.py's own heartbeat model and this class name is absent.
    hb: "HeartbeatRequestWithDiagnostics",
) -> None:
    """Persist the v1.1.0 diagnostic fields onto the heartbeats row for device_id.

    recent_events is JSON-serialized into a TEXT column for flexibility;
    the other four fields are stored as INTEGERs. A field the payload omits
    (i.e. None) leaves the stored column untouched, so a v1.0.0-shape
    heartbeat never erases diagnostics recorded by an earlier one. An empty
    recent_events list is not "omitted": it is stored as "[]".

    Args:
        db: Open SQLite connection. The update is committed before returning.
        device_id: Key of the heartbeats row to update. Every row matching
            this device_id is updated; no row is inserted if none matches.
        hb: Parsed heartbeat carrying the five optional diagnostic fields.
    """
    events = hb.recent_events
    recent_events_json = (
        json.dumps([ev.model_dump() for ev in events])
        if events is not None
        else None
    )

    cursor = db.cursor()
    # COALESCE(?, col) keeps the current column value whenever the bound
    # parameter is NULL; a plain "col = ?" would overwrite it with NULL.
    cursor.execute(
        """UPDATE heartbeats
           SET reset_reason = COALESCE(?, reset_reason),
               heap_free = COALESCE(?, heap_free),
               heap_min_free = COALESCE(?, heap_min_free),
               last_disconnect_code = COALESCE(?, last_disconnect_code),
               recent_events = COALESCE(?, recent_events)
           WHERE device_id = ?""",
        (
            hb.reset_reason,
            hb.heap_free,
            hb.heap_min_free,
            hb.last_disconnect_code,
            recent_events_json,
            device_id,
        ),
    )
    db.commit()
# ---------------------------------------------------------------------------
# Decoders — use these in dashboards / alerting to label the integer tags the
# firmware emits. Keep in sync with firmware/include/event_log.h.
# ---------------------------------------------------------------------------
# EventLogTag values (RecentEvent.t) -> human name.
# The trailing comment on each entry gives that tag's interpretation of d0/d1.
EVENT_TAG_DECODER = {
    1: "EVT_BOOT",            # d0=esp_reset_reason()
    2: "EVT_WIFI_UP",         # d0=RSSI (int16 cast to uint16)
    3: "EVT_WIFI_DOWN",       # d0=disconnect reason (0xFF = silent-death)
    4: "EVT_HTTP_OK",         # d0=path_hash, d1=elapsed_ms
    5: "EVT_HTTP_FAIL",       # d0=path_hash, d1=http_status_or_errno
    6: "EVT_HEARTBEAT_MISS",  # d0=consecutive_count
    7: "EVT_NTP_SYNC",        # d0=seconds_since_boot (reserved, not emitted)
    8: "EVT_REBOOT",          # d0=RebootReason (see REBOOT_REASON_DECODER)
}
# EVT_REBOOT.d0 values -> human name. Firmware-initiated reboot reasons.
REBOOT_REASON_DECODER = {
    1: "HEARTBEAT_MISS",
    2: "FACTORY_RESET",
    3: "OTA",
    4: "WIFI_REPROV",
    5: "FATAL_CONFIG",  # logged before the FATAL hang on config load failure
    6: "FATAL_CAMERA",  # logged before the FATAL hang on camera init failure
}