Two FATAL while(true) hangs in main.cpp (config load fail, camera init fail) previously relied on the hardware watchdog to reboot the device, leaving the cause invisible beyond a generic TWDT reset reason. Now each path logs EVT_REBOOT with REBOOT_FATAL_CONFIG or REBOOT_FATAL_CAMERA before hanging, so the next heartbeat's recent_events surfaces which branch hung. Server-side decoder updated for the two new enum values. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
128 lines
4.6 KiB
Python
128 lines
4.6 KiB
Python
# server/heartbeat_diagnostics_stub.py
# Add these models and the persistence helper to the server's main.py alongside
# the existing heartbeat endpoint (POST /api/v1/heartbeat).
# Requires: diagnostic columns on the heartbeats table (see migrations/005_heartbeat_diagnostics.sql)
#
# Firmware v1.1.0 extends the heartbeat payload with five optional diagnostic
# fields. v1.0.0-shape payloads (without these fields) must continue to parse
# cleanly — every new field is Optional and defaults to None.
#
# IMPORTANT: Adjust the table name in store_heartbeat_diagnostics to match the
# real server's schema if it differs from "heartbeats".
|
|
|
|
import json
|
|
import sqlite3
|
|
from typing import List, Optional
|
|
|
|
from pydantic import BaseModel
|
|
|
|
|
|
class RecentEvent(BaseModel):
    """One firmware event-log entry as carried in a heartbeat's recent_events list."""

    # Integer EventLogTag identifying the event kind (decode with EVENT_TAG_DECODER).
    t: int
    # First tag-specific datum; its meaning depends on `t`.
    d0: int
    # Second tag-specific datum; its meaning depends on `t`.
    d1: int
    # Unix timestamp of the event, in seconds.
    ts: int
    # Seconds since boot at the moment the event was logged.
    up: int
|
|
|
|
|
|
# Extend the existing HeartbeatRequest model in main.py by adding these five
# optional fields. The rest of the heartbeat model (device_id, uptime, etc.)
# stays as-is. Shown here as a standalone model for reference/testing.
|
|
class HeartbeatDiagnosticsFields(BaseModel):
    """The five optional diagnostic fields added to the heartbeat payload.

    Every field defaults to None, so a payload that carries none of them
    still validates.
    """

    # Integer reset-reason code reported by the device.
    # NOTE(review): presumably the esp_reset_reason() value — confirm against firmware.
    reset_reason: Optional[int] = None

    # Heap statistics reported by the device.
    # NOTE(review): units are presumably bytes — confirm against firmware.
    heap_free: Optional[int] = None
    heap_min_free: Optional[int] = None

    # Integer code describing the most recent disconnect.
    last_disconnect_code: Optional[int] = None

    # Event-log entries included with this heartbeat, if any.
    recent_events: Optional[List[RecentEvent]] = None
|
|
|
|
|
|
# Example of the fully-extended heartbeat request model (merge into the
# existing HeartbeatRequest in main.py rather than introducing a second class):
|
|
class HeartbeatRequestWithDiagnostics(BaseModel):
    """Reference shape of a heartbeat request that includes the diagnostic fields.

    Only device_id and uptime are required; each diagnostic field is optional
    and defaults to None.
    """

    # --- Base heartbeat fields -------------------------------------------
    device_id: str
    uptime: int
    # ... existing fields from the v1.0.0 heartbeat model go here ...

    # --- Diagnostic fields added in v1.1.0 (all optional) ----------------
    reset_reason: Optional[int] = None
    heap_free: Optional[int] = None
    heap_min_free: Optional[int] = None
    last_disconnect_code: Optional[int] = None
    recent_events: Optional[List[RecentEvent]] = None
|
|
|
|
|
|
# Call this inside the existing receive_heartbeat handler after the base
# heartbeat row has been inserted/updated. It persists the diagnostic fields
# on the same row keyed by device_id.
|
|
def store_heartbeat_diagnostics(
    db: sqlite3.Connection,
    device_id: str,
    # String annotation: the model is meant to be merged into the server's
    # existing HeartbeatRequest, so this name may not exist at import time.
    hb: "HeartbeatRequestWithDiagnostics",
) -> None:
    """Persist the v1.1.0 diagnostic fields onto the heartbeats row for device_id.

    recent_events is JSON-serialized into a TEXT column; the other four
    fields are bound as-is into their columns. A field that is None on
    ``hb`` leaves the stored column value untouched, so a v1.0.0-shape
    heartbeat does not erase diagnostics recorded by an earlier one.
    An empty recent_events list is not None and is stored as "[]".

    Args:
        db: Open SQLite connection; the update is committed before returning.
        device_id: Key of the heartbeats row to update. If no row matches,
            the UPDATE affects nothing and no error is raised.
        hb: Parsed heartbeat carrying reset_reason, heap_free, heap_min_free,
            last_disconnect_code and recent_events (each possibly None).
    """
    recent_events_json = (
        json.dumps([ev.model_dump() for ev in hb.recent_events])
        if hb.recent_events is not None
        else None
    )
    cursor = db.cursor()
    # COALESCE(?, col) keeps the current column value whenever the bound
    # parameter is NULL; a plain "col = ?" would overwrite it with NULL.
    cursor.execute(
        """UPDATE heartbeats
           SET reset_reason = COALESCE(?, reset_reason),
               heap_free = COALESCE(?, heap_free),
               heap_min_free = COALESCE(?, heap_min_free),
               last_disconnect_code = COALESCE(?, last_disconnect_code),
               recent_events = COALESCE(?, recent_events)
           WHERE device_id = ?""",
        (
            hb.reset_reason,
            hb.heap_free,
            hb.heap_min_free,
            hb.last_disconnect_code,
            recent_events_json,
            device_id,
        ),
    )
    db.commit()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Decoders — use these in dashboards / alerting to label the integer tags the
# firmware emits. Keep in sync with firmware/include/event_log.h.
# ---------------------------------------------------------------------------
|
|
|
|
# EventLogTag values (RecentEvent.t) -> human name.
# Per-tag interpretation of d0/d1:
#   EVT_BOOT            d0=esp_reset_reason()
#   EVT_WIFI_UP         d0=RSSI (int16 cast to uint16)
#   EVT_WIFI_DOWN       d0=disconnect reason (0xFF = silent-death)
#   EVT_HTTP_OK         d0=path_hash, d1=elapsed_ms
#   EVT_HTTP_FAIL       d0=path_hash, d1=http_status_or_errno
#   EVT_HEARTBEAT_MISS  d0=consecutive_count
#   EVT_NTP_SYNC        d0=seconds_since_boot (reserved, not emitted)
#   EVT_REBOOT          d0=RebootReason (see REBOOT_REASON_DECODER)
|
|
# Integer event tag -> symbolic name. Tags are contiguous and start at 1, so
# the mapping is built by numbering the names in order; the trailing comments
# give each name's tag value.
EVENT_TAG_DECODER = dict(
    enumerate(
        (
            "EVT_BOOT",            # 1
            "EVT_WIFI_UP",         # 2
            "EVT_WIFI_DOWN",       # 3
            "EVT_HTTP_OK",         # 4
            "EVT_HTTP_FAIL",       # 5
            "EVT_HEARTBEAT_MISS",  # 6
            "EVT_NTP_SYNC",        # 7
            "EVT_REBOOT",          # 8
        ),
        start=1,
    )
)
|
|
|
|
# EVT_REBOOT.d0 values -> human name. Firmware-initiated reboot reasons.
|
|
# Integer reboot-reason code -> symbolic name. Codes are contiguous and start
# at 1, so the mapping is built by numbering the names in order; the trailing
# comments give each name's code.
REBOOT_REASON_DECODER = dict(
    enumerate(
        (
            "HEARTBEAT_MISS",  # 1
            "FACTORY_RESET",   # 2
            "OTA",             # 3
            "WIFI_REPROV",     # 4
            "FATAL_CONFIG",    # 5
            "FATAL_CAMERA",    # 6
        ),
        start=1,
    )
)
|