fix(firmware): upgrade NimBLE to 2.x + DNS fallback for unreliable resolvers
NimBLE-Arduino 1.4.2 had an init/fire race in its FreeRTOS callout porting layer where os_callout_timer_cb dispatched a queued TimerHandle expiry against a not-yet-initialized event (NULL fn pointer), causing PC=0 InstrFetchProhibited within ~1s of boot when the camera task starved the timer service. Confirmed by ets_printf instrumentation. Upgrading to ^2.0.0 rewrites the porting layer and eliminates the race; verified clean on the customer network for 1+ hour.

Also rolls in DNS-resilience work that surfaced the BLE crash during provisioning: pin lwIP/esp-netif resolvers to 1.1.1.1/8.8.8.8 across DHCP renewals, add three-tier resolver fallback in reporter with a hardcoded IP of last resort, and switch to raw WiFiClient with manual Host header to bypass HTTPClient's brittle DNS path.

Migration touches for NimBLE 2.x:
- NimBLEAdvertisedDeviceCallbacks -> NimBLEScanCallbacks
- onResult signature now takes const NimBLEAdvertisedDevice*
- setAdvertisedDeviceCallbacks -> setScanCallbacks
- start(0, nullptr, false) -> start(0, false, false)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -9,8 +9,66 @@ uint32_t net_guard_next_backoff_ms(uint32_t attempt) {
|
||||
#ifdef ARDUINO
|
||||
#include "config.h"
|
||||
#include <WiFi.h>
|
||||
#include <Arduino.h>
|
||||
#include <lwip/dns.h>
|
||||
#include <esp_netif.h>
|
||||
#include "event_log.h"
|
||||
|
||||
// Both lwIP's ip_addr_t and esp-netif's esp_ip_addr_t alias the same on-disk
|
||||
// layout for IPv4, but the C++ types differ. Take the raw u32 to sidestep it.
|
||||
static String fmt_v4(uint32_t addr_be) {
|
||||
if (addr_be == 0) return String("0.0.0.0");
|
||||
char b[16];
|
||||
snprintf(b, sizeof(b), "%u.%u.%u.%u",
|
||||
(unsigned)((addr_be >> 0) & 0xFF),
|
||||
(unsigned)((addr_be >> 8) & 0xFF),
|
||||
(unsigned)((addr_be >> 16) & 0xFF),
|
||||
(unsigned)((addr_be >> 24) & 0xFF));
|
||||
return String(b);
|
||||
}
|
||||
|
||||
void net_guard_dump_dns(const char* tag) {
|
||||
const ip_addr_t* d0 = dns_getserver(0);
|
||||
const ip_addr_t* d1 = dns_getserver(1);
|
||||
Serial.printf("[DNS] %s lwip: %s , %s\n", tag,
|
||||
fmt_v4(d0 ? ip_2_ip4(d0)->addr : 0).c_str(),
|
||||
fmt_v4(d1 ? ip_2_ip4(d1)->addr : 0).c_str());
|
||||
|
||||
esp_netif_t* sta = esp_netif_get_handle_from_ifkey("WIFI_STA_DEF");
|
||||
if (sta) {
|
||||
esp_netif_dns_info_t main_dns{}, backup_dns{};
|
||||
esp_netif_get_dns_info(sta, ESP_NETIF_DNS_MAIN, &main_dns);
|
||||
esp_netif_get_dns_info(sta, ESP_NETIF_DNS_BACKUP, &backup_dns);
|
||||
Serial.printf("[DNS] %s netif: %s , %s\n", tag,
|
||||
fmt_v4(main_dns.ip.u_addr.ip4.addr).c_str(),
|
||||
fmt_v4(backup_dns.ip.u_addr.ip4.addr).c_str());
|
||||
} else {
|
||||
Serial.printf("[DNS] %s netif: <no STA handle>\n", tag);
|
||||
}
|
||||
}
|
||||
|
||||
void net_guard_pin_dns() {
|
||||
ip_addr_t d1, d2;
|
||||
IP_ADDR4(&d1, 1, 1, 1, 1);
|
||||
IP_ADDR4(&d2, 8, 8, 8, 8);
|
||||
dns_setserver(0, &d1);
|
||||
dns_setserver(1, &d2);
|
||||
|
||||
// Also push through the esp_netif layer. dns_setserver() writes the
|
||||
// global lwIP table directly; esp_netif_set_dns_info() is what the
|
||||
// DHCP client itself calls, so writing here prevents the next DHCP
|
||||
// event from silently overwriting our pin.
|
||||
esp_netif_t* sta = esp_netif_get_handle_from_ifkey("WIFI_STA_DEF");
|
||||
if (sta) {
|
||||
esp_netif_dns_info_t info{};
|
||||
IP_ADDR4(&info.ip, 1, 1, 1, 1);
|
||||
esp_netif_set_dns_info(sta, ESP_NETIF_DNS_MAIN, &info);
|
||||
IP_ADDR4(&info.ip, 8, 8, 8, 8);
|
||||
esp_netif_set_dns_info(sta, ESP_NETIF_DNS_BACKUP, &info);
|
||||
}
|
||||
net_guard_dump_dns("pinned");
|
||||
}
|
||||
|
||||
// Shared with the WiFi event task. 32-bit aligned loads/stores are atomic on
|
||||
// Xtensa; volatile suffices. Tick re-evaluates every loop iteration, so stale
|
||||
// reads self-correct within ~200ms.
|
||||
@@ -23,6 +81,11 @@ static volatile uint32_t s_next_retry_ms = 0;
|
||||
static void on_wifi_event(WiFiEvent_t event, WiFiEventInfo_t info) {
|
||||
switch (event) {
|
||||
case ARDUINO_EVENT_WIFI_STA_GOT_IP:
|
||||
// Override DHCP-supplied DNS. Some routers return TC=1 for short
|
||||
// answers (forcing TCP fallback that lwIP can't follow), or hand
|
||||
// out an unreachable resolver. Pin to public resolvers so
|
||||
// hostByName() never depends on the local network's DNS quality.
|
||||
net_guard_pin_dns();
|
||||
s_up = true;
|
||||
s_attempts = 0;
|
||||
s_next_retry_ms = 0;
|
||||
|
||||
@@ -21,4 +21,13 @@ uint8_t net_guard_last_disconnect_reason();
|
||||
|
||||
// Non-blocking tick called from loop(); kicks reconnect if due.
|
||||
extern "C" void net_guard_tick();
|
||||
|
||||
// Override DHCP-supplied DNS with public resolvers (1.1.1.1, 8.8.8.8).
|
||||
// Idempotent; safe to call repeatedly. net_guard re-applies on every GOT_IP,
|
||||
// but main.cpp must call it once for the boot association (which completes
|
||||
// before net_guard_start() registers its event handler).
|
||||
void net_guard_pin_dns();
|
||||
|
||||
// Diagnostic: print current DNS table state from both lwIP and esp_netif.
|
||||
void net_guard_dump_dns(const char* tag);
|
||||
#endif
|
||||
|
||||
@@ -21,7 +21,7 @@ upload_flags = --no-stub
|
||||
lib_deps =
|
||||
tzapu/WiFiManager@^2.0.17
|
||||
bblanchon/ArduinoJson@^7.0.0
|
||||
h2zero/NimBLE-Arduino@^1.4.2
|
||||
h2zero/NimBLE-Arduino@^2.0.0
|
||||
espressif/esp32-camera
|
||||
|
||||
; Frame-capture build. Strips WiFi/BLE/CV/reporter; streams raw 96x96 frames
|
||||
|
||||
@@ -42,8 +42,8 @@ static String sha256_prefix(const String& input) {
|
||||
return hex;
|
||||
}
|
||||
|
||||
class ScanCallback : public NimBLEAdvertisedDeviceCallbacks {
|
||||
void onResult(NimBLEAdvertisedDevice* dev) override {
|
||||
class ScanCallback : public NimBLEScanCallbacks {
|
||||
void onResult(const NimBLEAdvertisedDevice* dev) override {
|
||||
String mac = String(dev->getAddress().toString().c_str());
|
||||
String hash = sha256_prefix(mac);
|
||||
int rssi = dev->getRSSI();
|
||||
@@ -51,7 +51,6 @@ class ScanCallback : public NimBLEAdvertisedDeviceCallbacks {
|
||||
std::lock_guard<std::mutex> lock(s_mutex);
|
||||
auto it = s_seen.find(hash);
|
||||
if (it == s_seen.end()) {
|
||||
Serial.printf("[BLE] new device: %s (rssi %d)\n", hash.c_str(), rssi);
|
||||
s_seen[hash] = {rssi, 1};
|
||||
} else {
|
||||
it->second.rssi_sum += rssi;
|
||||
@@ -68,16 +67,16 @@ static NimBLEScan* s_scan = nullptr;
|
||||
// Initialise NimBLE and start an open-ended passive scan. Results are handed
// to s_callback only (nothing is stored in the scan object). Uses the
// NimBLE-Arduino 2.x API exclusively: the 1.x calls
// setAdvertisedDeviceCallbacks() and start(0, nullptr, false) are dropped so
// the callback is registered once and the scan is started once.
void ble_scanner_start() {
    NimBLEDevice::init("");
    s_scan = NimBLEDevice::getScan();
    s_scan->setScanCallbacks(&s_callback, true);  // true = allow duplicates
    s_scan->setActiveScan(false);                 // passive
    s_scan->setInterval(100);
    s_scan->setWindow(99);
    s_scan->setMaxResults(0);                     // don't store results — callback-only
    s_scan->start(0, false, false);               // duration=0 (forever), isContinue=false, restart=false
}
|
||||
|
||||
// Stop the scan. Does nothing if s_scan has not been set yet.
void ble_scanner_pause() {
    if (s_scan == nullptr) {
        return;
    }
    s_scan->stop();
}
|
||||
// Restart the open-ended scan with the same arguments ble_scanner_start()
// uses (duration=0, isContinue=false, restart=false). Does nothing if s_scan
// has not been set yet. Only the NimBLE-Arduino 2.x form is kept; the
// duplicate 1.x definition (start(0, nullptr, false)) is removed.
void ble_scanner_resume() {
    if (s_scan) {
        s_scan->start(0, false, false);
    }
}
|
||||
|
||||
void ble_scanner_deinit() {
|
||||
if (s_scan) s_scan->stop();
|
||||
|
||||
@@ -19,6 +19,15 @@
|
||||
#define BUTTON_PIN 37
|
||||
#define FACTORY_RESET_HOLD_MS 5000
|
||||
|
||||
// BLE scanning disabled in production until the NimBLE-Arduino 1.4.2 timer
|
||||
// race is resolved. Symptom: FreeRTOS timer task dispatches an
|
||||
// os_callout_timer_cb whose callback fn is NULL, causing PC=0 fetch and
|
||||
// Historical note: NimBLE-Arduino 1.4.2 had an init/fire race in its FreeRTOS
|
||||
// callout porting layer that caused a NULL-fn dispatch (PC=0,
|
||||
// InstrFetchProhibited) within ~1s of boot when the camera task starved the
|
||||
// timer service. Fixed by upgrading to 2.x (see platformio.ini).
|
||||
#define BLE_SCANNING_ENABLED 1
|
||||
|
||||
#define CAM_FPS 5
|
||||
#define CAM_INTERVAL_MS (1000 / CAM_FPS)
|
||||
#define REPORT_INTERVAL_S 3600
|
||||
@@ -67,16 +76,7 @@ static void task_camera(void*) {
|
||||
if (camera_capture_96(frame)) {
|
||||
if (xSemaphoreTake(s_cv_mutex, pdMS_TO_TICKS(100)) == pdTRUE) {
|
||||
CVResult r = cv_process(g_cv, frame, g_cfg.line_offset);
|
||||
for (const auto& t : g_cv.tracks) {
|
||||
if (t.id > last_logged_track_id) {
|
||||
last_logged_track_id = t.id;
|
||||
Serial.printf("[CV] spawn id=%d y=%.1f\n", t.id, t.spawn_y);
|
||||
}
|
||||
}
|
||||
if (r.fg_count > 0) {
|
||||
Serial.printf("[F] n=%d y=%d..%d c=%.1f\n",
|
||||
r.fg_count, r.fg_min_y, r.fg_max_y, r.fg_centroid_y);
|
||||
}
|
||||
(void)last_logged_track_id;
|
||||
if (r.entries_delta) Serial.printf("[CV] entry +%d (total %d) first=%.1f min=%.1f max=%.1f last=%.1f dur=%d\n",
|
||||
r.entries_delta, g_cv.entries,
|
||||
r.fire_first_c, r.fire_min_c, r.fire_max_c, r.fire_last_c, r.fire_duration);
|
||||
@@ -119,7 +119,9 @@ static void task_reporter(void*) {
|
||||
last_report_ts = now;
|
||||
|
||||
// Deinit BLE to free ~25KB heap for SSL handshakes
|
||||
#if BLE_SCANNING_ENABLED
|
||||
ble_scanner_deinit();
|
||||
#endif
|
||||
led_set(true); // on = uploading
|
||||
|
||||
CameraHourlyRecord cam_rec;
|
||||
@@ -129,18 +131,26 @@ static void task_reporter(void*) {
|
||||
xSemaphoreGive(s_cv_mutex);
|
||||
} else {
|
||||
// Failed to acquire — skip this cycle, will report next hour
|
||||
#if BLE_SCANNING_ENABLED
|
||||
ble_scanner_reinit();
|
||||
#endif
|
||||
led_set(false);
|
||||
continue;
|
||||
}
|
||||
|
||||
#if !BLE_SCANNING_ENABLED
|
||||
BLEHourlyRecord ble_rec = {period_start, period_end, 0, 0};
|
||||
#else
|
||||
BLEHourlyRecord ble_rec = ble_scanner_collect(period_start, period_end);
|
||||
#endif
|
||||
|
||||
reporter_submit_camera(g_cfg, cam_rec);
|
||||
reporter_submit_ble(g_cfg, ble_rec);
|
||||
bool hb_ok = reporter_heartbeat(g_cfg, millis() / 1000, WiFi.RSSI());
|
||||
|
||||
#if BLE_SCANNING_ENABLED
|
||||
ble_scanner_reinit();
|
||||
#endif
|
||||
led_set(false);
|
||||
|
||||
static uint8_t consecutive_misses = 0;
|
||||
@@ -202,6 +212,11 @@ void setup() {
|
||||
ESP.restart();
|
||||
}
|
||||
|
||||
// Boot connect happens before net_guard registers its WiFi event handler,
|
||||
// so the GOT_IP-driven DNS override there won't fire for this association.
|
||||
// Pin DNS now; net_guard re-applies it on every subsequent reconnect.
|
||||
net_guard_pin_dns();
|
||||
|
||||
net_guard_start(g_cfg);
|
||||
led_set(false); // off = connected
|
||||
|
||||
@@ -220,17 +235,29 @@ void setup() {
|
||||
|
||||
reporter_init();
|
||||
|
||||
#if BLE_SCANNING_ENABLED
|
||||
ble_scanner_start();
|
||||
#endif
|
||||
|
||||
// OTA update support
|
||||
ArduinoOTA.setHostname(g_cfg.device_id.c_str());
|
||||
#if !BLE_SCANNING_ENABLED
|
||||
ArduinoOTA.onStart([]() { });
|
||||
#else
|
||||
ArduinoOTA.onStart([]() { ble_scanner_pause(); });
|
||||
#endif
|
||||
ArduinoOTA.onEnd([]() {
|
||||
#if BLE_SCANNING_ENABLED
|
||||
ble_scanner_resume();
|
||||
#endif
|
||||
event_log_write(EVT_REBOOT, REBOOT_OTA, 0);
|
||||
ESP.restart();
|
||||
});
|
||||
#if !BLE_SCANNING_ENABLED
|
||||
ArduinoOTA.onError([](ota_error_t e) { });
|
||||
#else
|
||||
ArduinoOTA.onError([](ota_error_t e) { ble_scanner_resume(); });
|
||||
#endif
|
||||
ArduinoOTA.begin();
|
||||
|
||||
s_cv_mutex = xSemaphoreCreateMutex();
|
||||
|
||||
@@ -26,28 +26,107 @@ static uint32_t now_ts() {
|
||||
return (uint32_t)time(nullptr);
|
||||
}
|
||||
|
||||
// Last successfully resolved IP — used as a warm fallback if a subsequent
|
||||
// resolution fails. Never takes precedence over a fresh successful resolve.
|
||||
static IPAddress s_cached_api_ip;
|
||||
|
||||
// Resolve the API host. Tries hostByName first; on failure falls back to the
|
||||
// last good resolution, then to the hardcoded fallback IP. Returns the IP via
|
||||
// out-param and a label describing where it came from for logging.
|
||||
static bool resolve_api_ip(IPAddress& out, const char*& source) {
|
||||
IPAddress ip;
|
||||
uint32_t r0 = millis();
|
||||
bool ok = WiFi.hostByName(REPORTER_API_HOST_NAME, ip);
|
||||
uint32_t elapsed = millis() - r0;
|
||||
if (ok) {
|
||||
s_cached_api_ip = ip;
|
||||
out = ip;
|
||||
source = "dns";
|
||||
Serial.printf("[DNS] %s -> %s (%u ms)\n",
|
||||
REPORTER_API_HOST_NAME, ip.toString().c_str(), (unsigned)elapsed);
|
||||
return true;
|
||||
}
|
||||
Serial.printf("[DNS] %s -> FAIL (%u ms)\n",
|
||||
REPORTER_API_HOST_NAME, (unsigned)elapsed);
|
||||
net_guard_dump_dns("on-fail");
|
||||
net_guard_pin_dns(); // re-assert in case something overwrote the table
|
||||
|
||||
if ((uint32_t)s_cached_api_ip != 0) {
|
||||
out = s_cached_api_ip;
|
||||
source = "cache";
|
||||
return true;
|
||||
}
|
||||
if (out.fromString(REPORTER_API_FALLBACK_IP)) {
|
||||
source = "fallback";
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Drains and parses the HTTP response status line. Returns the numeric status
|
||||
// code, or -1 on read timeout / malformed response.
|
||||
static int read_http_status(WiFiClient& client, uint32_t timeout_ms) {
|
||||
uint32_t deadline = millis() + timeout_ms;
|
||||
while (!client.available() && millis() < deadline) vTaskDelay(pdMS_TO_TICKS(10));
|
||||
if (!client.available()) return -1;
|
||||
String line = client.readStringUntil('\n');
|
||||
line.trim();
|
||||
// Format: "HTTP/1.1 200 OK"
|
||||
int sp1 = line.indexOf(' ');
|
||||
if (sp1 < 0) return -1;
|
||||
int sp2 = line.indexOf(' ', sp1 + 1);
|
||||
String code_str = (sp2 > 0) ? line.substring(sp1 + 1, sp2) : line.substring(sp1 + 1);
|
||||
return code_str.toInt();
|
||||
}
|
||||
|
||||
static bool post_json_once(const DeviceConfig& cfg, const char* path, const String& body) {
|
||||
uint32_t ts = now_ts();
|
||||
if (ts < 1700000000UL) return false;
|
||||
String sig = hmac_sign(cfg.hmac_secret, "POST", path, ts, body);
|
||||
if (sig.isEmpty()) return false;
|
||||
|
||||
HTTPClient http;
|
||||
String url = String(REPORTER_API_HOST) + path;
|
||||
http.begin(url);
|
||||
http.setConnectTimeout(5000); // DNS + TCP connect
|
||||
http.setTimeout(10000); // per-transaction response timeout
|
||||
http.addHeader("Content-Type", "application/json");
|
||||
http.addHeader("X-Device-Id", cfg.device_id);
|
||||
http.addHeader("X-Timestamp", String(ts));
|
||||
http.addHeader("X-Signature", sig);
|
||||
IPAddress ip;
|
||||
const char* ip_source = "?";
|
||||
if (!resolve_api_ip(ip, ip_source)) {
|
||||
Serial.printf("[HTTP] POST %s -> resolve-fail\n", path);
|
||||
event_log_write(EVT_HTTP_FAIL, event_log_path_hash(path), (uint16_t)-1);
|
||||
return false;
|
||||
}
|
||||
|
||||
uint32_t t0 = millis();
|
||||
int code = http.POST(body);
|
||||
WiFiClient client;
|
||||
client.setTimeout(10); // seconds — read timeout
|
||||
if (!client.connect(ip, REPORTER_API_PORT, 5000 /*ms connect timeout*/)) {
|
||||
uint32_t elapsed = millis() - t0;
|
||||
Serial.printf("[HTTP] connect %s:%u (%s) -> failed (%u ms)\n",
|
||||
ip.toString().c_str(), REPORTER_API_PORT, ip_source, (unsigned)elapsed);
|
||||
event_log_write(EVT_HTTP_FAIL, event_log_path_hash(path), (uint16_t)-1);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Manual HTTP/1.1 — gives us full control over the Host header so the
|
||||
// server's vhost routing works even when we connect by IP.
|
||||
client.printf("POST %s HTTP/1.1\r\n", path);
|
||||
client.printf("Host: %s\r\n", REPORTER_API_HOST_NAME);
|
||||
client.print ("Connection: close\r\n");
|
||||
client.print ("Content-Type: application/json\r\n");
|
||||
client.printf("Content-Length: %u\r\n", (unsigned)body.length());
|
||||
client.printf("X-Device-Id: %s\r\n", cfg.device_id.c_str());
|
||||
client.printf("X-Timestamp: %u\r\n", (unsigned)ts);
|
||||
client.printf("X-Signature: %s\r\n", sig.c_str());
|
||||
client.print ("\r\n");
|
||||
client.print(body);
|
||||
|
||||
int code = read_http_status(client, 10000);
|
||||
// Drain so the server can close cleanly.
|
||||
while (client.connected() && client.available()) client.read();
|
||||
client.stop();
|
||||
|
||||
uint32_t elapsed = millis() - t0;
|
||||
http.end();
|
||||
uint16_t phash = event_log_path_hash(path);
|
||||
Serial.printf("[HTTP] POST %s -> %d (%u ms)\n", url.c_str(), code, (unsigned)elapsed);
|
||||
Serial.printf("[HTTP] POST %s%s (%s %s) -> %d (%u ms)\n",
|
||||
REPORTER_API_HOST_NAME, path, ip_source, ip.toString().c_str(),
|
||||
code, (unsigned)elapsed);
|
||||
if (code == 200) {
|
||||
event_log_write(EVT_HTTP_OK, phash, (uint16_t)((elapsed > 65535) ? 65535 : elapsed));
|
||||
return true;
|
||||
|
||||
@@ -11,8 +11,13 @@ struct CameraHourlyRecord {
|
||||
int exits;
|
||||
};
|
||||
|
||||
static const int REPORTER_MAX_BUFFER = 24;
|
||||
static const char* REPORTER_API_HOST = "http://logs.research.bike";
|
||||
static const int REPORTER_MAX_BUFFER = 24;
|
||||
static const char* REPORTER_API_HOST_NAME = "logs.research.bike";
|
||||
static const uint16_t REPORTER_API_PORT = 80;
|
||||
// Hardcoded fallback used when DNS fails (some customer networks intercept
|
||||
// :53 with a transparent proxy that mangles responses). Update if the
|
||||
// server's IP changes — but a successful hostByName() always wins over this.
|
||||
static const char* REPORTER_API_FALLBACK_IP = "5.78.114.131";
|
||||
|
||||
void reporter_init();
|
||||
void reporter_submit_camera(const DeviceConfig& cfg, const CameraHourlyRecord& rec);
|
||||
|
||||
Reference in New Issue
Block a user