fix(firmware): upgrade NimBLE to 2.x + DNS fallback for unreliable resolvers

NimBLE-Arduino 1.4.2 had an init/fire race in its FreeRTOS callout porting
layer where os_callout_timer_cb dispatched a queued TimerHandle expiry
against a not-yet-initialized event (NULL fn pointer), causing PC=0
InstrFetchProhibited within ~1s of boot when the camera task starved the
timer service. Confirmed by ets_printf instrumentation. Upgrading to
^2.0.0 rewrites the porting layer and eliminates the race; verified clean
on the customer network for 1+ hour.

Also rolls in DNS-resilience work that surfaced the BLE crash during
provisioning: pin lwIP/esp-netif resolvers to 1.1.1.1/8.8.8.8 across DHCP
renewals, add three-tier resolver fallback in reporter with a hardcoded
IP of last resort, and switch to raw WiFiClient with manual Host header
to bypass HTTPClient's brittle DNS path.

Migration touches for NimBLE 2.x:
- NimBLEAdvertisedDeviceCallbacks -> NimBLEScanCallbacks
- onResult signature now takes const NimBLEAdvertisedDevice*
- setAdvertisedDeviceCallbacks -> setScanCallbacks
- start(0, nullptr, false) -> start(0, false, false)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-01 11:34:17 -07:00
parent 461ed7d888
commit a585a56cff
9 changed files with 1434 additions and 31 deletions

View File

@@ -42,8 +42,8 @@ static String sha256_prefix(const String& input) {
return hex;
}
class ScanCallback : public NimBLEAdvertisedDeviceCallbacks {
void onResult(NimBLEAdvertisedDevice* dev) override {
class ScanCallback : public NimBLEScanCallbacks {
void onResult(const NimBLEAdvertisedDevice* dev) override {
String mac = String(dev->getAddress().toString().c_str());
String hash = sha256_prefix(mac);
int rssi = dev->getRSSI();
@@ -51,7 +51,6 @@ class ScanCallback : public NimBLEAdvertisedDeviceCallbacks {
std::lock_guard<std::mutex> lock(s_mutex);
auto it = s_seen.find(hash);
if (it == s_seen.end()) {
Serial.printf("[BLE] new device: %s (rssi %d)\n", hash.c_str(), rssi);
s_seen[hash] = {rssi, 1};
} else {
it->second.rssi_sum += rssi;
@@ -68,16 +67,16 @@ static NimBLEScan* s_scan = nullptr;
void ble_scanner_start() {
NimBLEDevice::init("");
s_scan = NimBLEDevice::getScan();
s_scan->setAdvertisedDeviceCallbacks(&s_callback, true); // true = allow duplicates
s_scan->setScanCallbacks(&s_callback, true); // true = allow duplicates
s_scan->setActiveScan(false); // passive
s_scan->setInterval(100);
s_scan->setWindow(99);
s_scan->setMaxResults(0); // don't store results — callback-only
s_scan->start(0, nullptr, false); // 0 = continuous
s_scan->start(0, false, false); // duration=0 (forever), isContinue=false, restart=false
}
// Stops the scan in progress. Does nothing when no scan object has been
// obtained yet (s_scan is still null).
void ble_scanner_pause() {
    if (s_scan == nullptr) {
        return;
    }
    s_scan->stop();
}
void ble_scanner_resume() { if (s_scan) s_scan->start(0, nullptr, false); }
void ble_scanner_resume() { if (s_scan) s_scan->start(0, false, false); }
void ble_scanner_deinit() {
if (s_scan) s_scan->stop();

View File

@@ -19,6 +19,15 @@
#define BUTTON_PIN 37
#define FACTORY_RESET_HOLD_MS 5000
// BLE scanning disabled in production until the NimBLE-Arduino 1.4.2 timer
// race is resolved. Symptom: FreeRTOS timer task dispatches an
// os_callout_timer_cb whose callback fn is NULL, causing PC=0 fetch and
// Historical note: NimBLE-Arduino 1.4.2 had an init/fire race in its FreeRTOS
// callout porting layer that caused a NULL-fn dispatch (PC=0,
// InstrFetchProhibited) within ~1s of boot when the camera task starved the
// timer service. Fixed by upgrading to 2.x (see platformio.ini).
#define BLE_SCANNING_ENABLED 1
#define CAM_FPS 5
#define CAM_INTERVAL_MS (1000 / CAM_FPS)
#define REPORT_INTERVAL_S 3600
@@ -67,16 +76,7 @@ static void task_camera(void*) {
if (camera_capture_96(frame)) {
if (xSemaphoreTake(s_cv_mutex, pdMS_TO_TICKS(100)) == pdTRUE) {
CVResult r = cv_process(g_cv, frame, g_cfg.line_offset);
for (const auto& t : g_cv.tracks) {
if (t.id > last_logged_track_id) {
last_logged_track_id = t.id;
Serial.printf("[CV] spawn id=%d y=%.1f\n", t.id, t.spawn_y);
}
}
if (r.fg_count > 0) {
Serial.printf("[F] n=%d y=%d..%d c=%.1f\n",
r.fg_count, r.fg_min_y, r.fg_max_y, r.fg_centroid_y);
}
(void)last_logged_track_id;
if (r.entries_delta) Serial.printf("[CV] entry +%d (total %d) first=%.1f min=%.1f max=%.1f last=%.1f dur=%d\n",
r.entries_delta, g_cv.entries,
r.fire_first_c, r.fire_min_c, r.fire_max_c, r.fire_last_c, r.fire_duration);
@@ -119,7 +119,9 @@ static void task_reporter(void*) {
last_report_ts = now;
// Deinit BLE to free ~25KB heap for SSL handshakes
#if BLE_SCANNING_ENABLED
ble_scanner_deinit();
#endif
led_set(true); // on = uploading
CameraHourlyRecord cam_rec;
@@ -129,18 +131,26 @@ static void task_reporter(void*) {
xSemaphoreGive(s_cv_mutex);
} else {
// Failed to acquire — skip this cycle, will report next hour
#if BLE_SCANNING_ENABLED
ble_scanner_reinit();
#endif
led_set(false);
continue;
}
#if !BLE_SCANNING_ENABLED
BLEHourlyRecord ble_rec = {period_start, period_end, 0, 0};
#else
BLEHourlyRecord ble_rec = ble_scanner_collect(period_start, period_end);
#endif
reporter_submit_camera(g_cfg, cam_rec);
reporter_submit_ble(g_cfg, ble_rec);
bool hb_ok = reporter_heartbeat(g_cfg, millis() / 1000, WiFi.RSSI());
#if BLE_SCANNING_ENABLED
ble_scanner_reinit();
#endif
led_set(false);
static uint8_t consecutive_misses = 0;
@@ -202,6 +212,11 @@ void setup() {
ESP.restart();
}
// Boot connect happens before net_guard registers its WiFi event handler,
// so the GOT_IP-driven DNS override there won't fire for this association.
// Pin DNS now; net_guard re-applies it on every subsequent reconnect.
net_guard_pin_dns();
net_guard_start(g_cfg);
led_set(false); // off = connected
@@ -220,17 +235,29 @@ void setup() {
reporter_init();
#if BLE_SCANNING_ENABLED
ble_scanner_start();
#endif
// OTA update support
ArduinoOTA.setHostname(g_cfg.device_id.c_str());
#if !BLE_SCANNING_ENABLED
ArduinoOTA.onStart([]() { });
#else
ArduinoOTA.onStart([]() { ble_scanner_pause(); });
#endif
ArduinoOTA.onEnd([]() {
#if BLE_SCANNING_ENABLED
ble_scanner_resume();
#endif
event_log_write(EVT_REBOOT, REBOOT_OTA, 0);
ESP.restart();
});
#if !BLE_SCANNING_ENABLED
ArduinoOTA.onError([](ota_error_t e) { });
#else
ArduinoOTA.onError([](ota_error_t e) { ble_scanner_resume(); });
#endif
ArduinoOTA.begin();
s_cv_mutex = xSemaphoreCreateMutex();

View File

@@ -26,28 +26,107 @@ static uint32_t now_ts() {
return (uint32_t)time(nullptr);
}
// Last successfully resolved IP — used as a warm fallback if a subsequent
// resolution fails. Never takes precedence over a fresh successful resolve.
// Overwritten by resolve_api_ip() on every successful lookup; that function
// treats a zero value as "nothing cached yet" and skips the cache tier.
static IPAddress s_cached_api_ip;
// Chooses the address used to reach REPORTER_API_HOST_NAME.
//
// Tiers, in order of preference:
//   1. a fresh WiFi.hostByName() lookup          -> source = "dns"
//   2. the address remembered from an earlier
//      successful lookup (s_cached_api_ip)       -> source = "cache"
//   3. REPORTER_API_FALLBACK_IP parsed as an IP  -> source = "fallback"
//
// out    receives the chosen address.
// source receives a static label naming the tier, for logging.
// Returns false only when all three tiers fail; `source` is left untouched
// in that case.
static bool resolve_api_ip(IPAddress& out, const char*& source) {
    IPAddress resolved;
    const uint32_t started = millis();
    const bool lookup_ok = WiFi.hostByName(REPORTER_API_HOST_NAME, resolved);
    const uint32_t took_ms = millis() - started;

    if (!lookup_ok) {
        Serial.printf("[DNS] %s -> FAIL (%u ms)\n",
                      REPORTER_API_HOST_NAME, static_cast<unsigned>(took_ms));
        net_guard_dump_dns("on-fail");
        // Re-assert the pinned resolvers in case something overwrote the table.
        net_guard_pin_dns();

        // Tier 2: a zero cache value means no lookup has succeeded yet.
        const bool have_cached = static_cast<uint32_t>(s_cached_api_ip) != 0;
        if (have_cached) {
            out = s_cached_api_ip;
            source = "cache";
            return true;
        }

        // Tier 3: hardcoded address of last resort.
        if (!out.fromString(REPORTER_API_FALLBACK_IP)) {
            return false;
        }
        source = "fallback";
        return true;
    }

    // Tier 1: fresh answer — remember it for later failures and hand it back.
    s_cached_api_ip = resolved;
    out = resolved;
    source = "dns";
    Serial.printf("[DNS] %s -> %s (%u ms)\n",
                  REPORTER_API_HOST_NAME, resolved.toString().c_str(),
                  static_cast<unsigned>(took_ms));
    return true;
}
// Waits for the first response bytes, then reads and parses the HTTP status
// line (e.g. "HTTP/1.1 200 OK"). Only the status line is consumed; headers
// and body are left in the client for the caller to drain.
//
// client      connected client on which the request has already been sent.
// timeout_ms  maximum time to wait for the first byte of the response.
//
// Returns the numeric status code (100..599), or -1 on read timeout or when
// the line is not a well-formed HTTP status line.
static int read_http_status(WiFiClient& client, uint32_t timeout_ms) {
    // Compare elapsed time rather than an absolute deadline: unsigned
    // subtraction stays correct when millis() wraps around, whereas
    // "millis() + timeout_ms" can overflow and end the wait immediately.
    const uint32_t start = millis();
    while (!client.available() && (uint32_t)(millis() - start) < timeout_ms) {
        vTaskDelay(pdMS_TO_TICKS(10));
    }
    if (!client.available()) return -1;

    String line = client.readStringUntil('\n');
    line.trim();  // drops the trailing '\r'

    // Format: "HTTP/1.1 200 OK"
    if (!line.startsWith("HTTP/")) return -1;
    int sp1 = line.indexOf(' ');
    if (sp1 < 0) return -1;
    int sp2 = line.indexOf(' ', sp1 + 1);
    // The reason phrase is optional, so the code may run to the end of line.
    String code_str = (sp2 > 0) ? line.substring(sp1 + 1, sp2) : line.substring(sp1 + 1);

    // toInt() yields 0 for non-numeric text; reject that and anything else
    // outside the valid status range so callers really do see -1 on a
    // malformed response.
    const long code = code_str.toInt();
    if (code < 100 || code > 599) return -1;
    return (int)code;
}
static bool post_json_once(const DeviceConfig& cfg, const char* path, const String& body) {
uint32_t ts = now_ts();
if (ts < 1700000000UL) return false;
String sig = hmac_sign(cfg.hmac_secret, "POST", path, ts, body);
if (sig.isEmpty()) return false;
HTTPClient http;
String url = String(REPORTER_API_HOST) + path;
http.begin(url);
http.setConnectTimeout(5000); // DNS + TCP connect
http.setTimeout(10000); // per-transaction response timeout
http.addHeader("Content-Type", "application/json");
http.addHeader("X-Device-Id", cfg.device_id);
http.addHeader("X-Timestamp", String(ts));
http.addHeader("X-Signature", sig);
IPAddress ip;
const char* ip_source = "?";
if (!resolve_api_ip(ip, ip_source)) {
Serial.printf("[HTTP] POST %s -> resolve-fail\n", path);
event_log_write(EVT_HTTP_FAIL, event_log_path_hash(path), (uint16_t)-1);
return false;
}
uint32_t t0 = millis();
int code = http.POST(body);
WiFiClient client;
client.setTimeout(10); // seconds — read timeout
if (!client.connect(ip, REPORTER_API_PORT, 5000 /*ms connect timeout*/)) {
uint32_t elapsed = millis() - t0;
Serial.printf("[HTTP] connect %s:%u (%s) -> failed (%u ms)\n",
ip.toString().c_str(), REPORTER_API_PORT, ip_source, (unsigned)elapsed);
event_log_write(EVT_HTTP_FAIL, event_log_path_hash(path), (uint16_t)-1);
return false;
}
// Manual HTTP/1.1 — gives us full control over the Host header so the
// server's vhost routing works even when we connect by IP.
client.printf("POST %s HTTP/1.1\r\n", path);
client.printf("Host: %s\r\n", REPORTER_API_HOST_NAME);
client.print ("Connection: close\r\n");
client.print ("Content-Type: application/json\r\n");
client.printf("Content-Length: %u\r\n", (unsigned)body.length());
client.printf("X-Device-Id: %s\r\n", cfg.device_id.c_str());
client.printf("X-Timestamp: %u\r\n", (unsigned)ts);
client.printf("X-Signature: %s\r\n", sig.c_str());
client.print ("\r\n");
client.print(body);
int code = read_http_status(client, 10000);
// Drain so the server can close cleanly.
while (client.connected() && client.available()) client.read();
client.stop();
uint32_t elapsed = millis() - t0;
http.end();
uint16_t phash = event_log_path_hash(path);
Serial.printf("[HTTP] POST %s -> %d (%u ms)\n", url.c_str(), code, (unsigned)elapsed);
Serial.printf("[HTTP] POST %s%s (%s %s) -> %d (%u ms)\n",
REPORTER_API_HOST_NAME, path, ip_source, ip.toString().c_str(),
code, (unsigned)elapsed);
if (code == 200) {
event_log_write(EVT_HTTP_OK, phash, (uint16_t)((elapsed > 65535) ? 65535 : elapsed));
return true;

View File

@@ -11,8 +11,13 @@ struct CameraHourlyRecord {
int exits;
};
static const int REPORTER_MAX_BUFFER = 24;
static const char* REPORTER_API_HOST = "http://logs.research.bike";
static const int REPORTER_MAX_BUFFER = 24;
static const char* REPORTER_API_HOST_NAME = "logs.research.bike";
static const uint16_t REPORTER_API_PORT = 80;
// Hardcoded fallback used when DNS fails (some customer networks intercept
// :53 with a transparent proxy that mangles responses). Update if the
// server's IP changes — but a successful hostByName() always wins over this.
static const char* REPORTER_API_FALLBACK_IP = "5.78.114.131";
void reporter_init();
void reporter_submit_camera(const DeviceConfig& cfg, const CameraHourlyRecord& rec);