NMI Fault without any obvious fault bits set
Hi everyone,
I'm currently experiencing a strange crash on a STM32G473 and I'm a bit stumped on how to debug it.
First the crash:
The system is an STM32G473 running FreeRTOS V10.5.1. I have a simple FDCAN ISR which takes the incoming CAN frames and pushes them to a FreeRTOS queue for use in user space in a task; it does nothing fancy beyond that, just transforming the data into a nicer structure, clearing the FIFO and so on. During high bus loads, however, the FDCAN ISR will sometimes fire during a FreeRTOS context switch. When it does, the system crashes, which is a problem in and of itself, but the main issue is that the NMI is triggered and not the HardFault.
SRAM Parity is enabled, but not Clock Security.
Attached is the NMI handler. I'm checking every bit in HFSR, CFSR and AFSR; the SRAM parity error bit, the flash ECC bit and even the CSS bit are checked as well. I'm stepping through the function and checking whether anything could have triggered the NMI. None of the relevant bits are set to 1, however. I can clearly see that the memcpy() in the FreeRTOS queue-pushing is triggering the NMI, and the call stack in the debugger is very clear that an FDCAN ISR was triggered during the context switch.
Other things tried: I've checked the vector table to see if the NMI handler had ended up at another position, which it hasn't. The HardFault handler works as intended (it's a simple while(1) { __asm("nop"); } right now). I've checked the errata and couldn't find anything related to the NMI.
So my questions are: How do I properly debug the NMI, and why does it trigger instead of a HardFault? Are there any more registers I need to check to determine why we are in the fault handler?
/**
* @brief Assembler part of the NMI handler
*
* Tests bit 2 of the EXC_RETURN value held in LR to determine which stack
* pointer (MSP or PSP) was in use when the exception was taken, copies that
* stack pointer into R0 and branches to nmi_handler_c().
* R0 is the first argument of the C function, so the C code receives a pointer
* to the stack that was active at exception entry and can inspect it.
*
* The function is declared naked and contains nothing but the assembly below,
* so LR is still the EXC_RETURN value when the TST executes. The C handler is
* reached with a plain branch (B, not BL), which leaves LR untouched.
*/
__attribute__((naked)) void NMI_Handler(void)
{
__asm(
"TST LR, #4\n" /* Test bit 2 of the EXC_RETURN value in LR */
"ITE EQ\n" /* If-Then-Else: EQ (bit clear) selects MSP, NE (bit set) selects PSP */
"MRSEQ R0, MSP\n" /* LR[2] == 0: copy the main stack pointer to R0 */
"MRSNE R0, PSP\n" /* LR[2] != 0: copy the process stack pointer to R0 */
"B nmi_handler_c\n"); /* Branch to the C handler; R0 (stack pointer) is its argument. */
}
/**
 * @brief C part of the NMI handler
 *
 * Snapshots the stacked exception frame and the fault/status registers into
 * volatile locals so they can be inspected in a debugger, walks through the
 * individual flag bits (each branch holds a nop to set a breakpoint on) and
 * finally halts on a BKPT instruction.
 *
 * NOTE(review): CFSR/HFSR/MMFAR/BFAR are written by the fault exceptions. An
 * NMI raised by a peripheral source (SRAM parity, flash ECC, clock security)
 * is not expected to set any of them - confirm against the reference manual.
 * The stacked PC/LR captured below identify the interrupted code either way.
 *
 * @param stacked_registers Pointer to the stack that was active when the
 *                          exception was taken (MSP or PSP, as selected by
 *                          NMI_Handler). The first eight words are the
 *                          hardware-stacked frame: R0-R3, R12, LR, PC, xPSR.
 */
void nmi_handler_c(unsigned int* stacked_registers)
{
    /* --- Exception stack frame pushed by the core on exception entry --- */
    volatile unsigned int stacked_r0  = stacked_registers[0];
    volatile unsigned int stacked_r1  = stacked_registers[1];
    volatile unsigned int stacked_r2  = stacked_registers[2];
    volatile unsigned int stacked_r3  = stacked_registers[3];
    volatile unsigned int stacked_r12 = stacked_registers[4];
    volatile unsigned int stacked_lr  = stacked_registers[5]; /* LR of the interrupted code */
    volatile unsigned int stacked_pc  = stacked_registers[6]; /* Address the core would return to */
    volatile unsigned int stacked_psr = stacked_registers[7]; /* xPSR; low bits hold the preempted exception number */

    /* --- Core fault status registers --- */
    volatile unsigned int hfsr = SCB->HFSR;   /* Hard Fault Status Register */
    volatile unsigned int cfsr = SCB->CFSR;   /* Configurable Fault Status Register */
    volatile unsigned int mmfar = SCB->MMFAR; /* Memory Management Fault Address Register */
    volatile unsigned int bfar = SCB->BFAR;   /* Bus Fault Address Register */
    volatile unsigned int afsr = SCB->AFSR;   /* Auxiliary Fault Status Register */

    /* --- Device-level flags read from SYSCFG, FLASH and RCC --- */
    volatile unsigned int sram_parity = SYSCFG->CFGR2 & SYSCFG_CFGR2_SPF;
    volatile unsigned int flash_error = FLASH->ECCR & (FLASH_ECCR_ECCD2 | FLASH_ECCR_ECCD);
    volatile unsigned int css_error = RCC->CIFR & (RCC_CIFR_CSSF | RCC_CIFR_LSECSSF);

    // --- SRAM and Flash Parity errors ---
    if (sram_parity)
    {
        // SRAM parity failed
        __asm("nop");
    }
    if (flash_error)
    {
        // Flash ECC error
        __asm("nop");
    }
    if (css_error)
    {
        // Clock Security error
        __asm("nop");
    }

    // --- Memory Management Fault Analysis (CFSR bits 0-7) ---
    if (cfsr & (1UL << 0))
    {
        // IACCVIOL: An instruction access violation occurred.
        __asm("nop");
    }
    if (cfsr & (1UL << 1))
    {
        // DACCVIOL: A data access violation occurred.
        __asm("nop");
    }
    if (cfsr & (1UL << 3))
    {
        // MUNSTKERR: Unstacking error during exception return.
        __asm("nop");
    }
    if (cfsr & (1UL << 4))
    {
        // MSTKERR: Stacking error during exception entry.
        __asm("nop");
    }
    if (cfsr & (1UL << 5))
    {
        // MLSPERR: Lazy state preservation error occurred.
        __asm("nop");
    }
    if (cfsr & (1UL << 7))
    {
        // MMARVALID is set: The MMFAR register holds a valid memory fault address.
        // Check mmfar to see the address that triggered the memory management fault.
        (void)mmfar;
        __asm("nop");
    }

    // --- Bus Fault Analysis (CFSR bits 8-15) ---
    if (cfsr & (1UL << 8))
    {
        // IBUSERR: An instruction bus error occurred.
        __asm("nop");
    }
    if (cfsr & (1UL << 9))
    {
        // PRECISERR: A precise data bus error occurred.
        __asm("nop");
    }
    if (cfsr & (1UL << 10))
    {
        // IMPRECISERR: An imprecise data bus error occurred.
        __asm("nop");
    }
    if (cfsr & (1UL << 11))
    {
        // UNSTKERR: Unstacking error during exception return (bus fault).
        __asm("nop");
    }
    if (cfsr & (1UL << 12))
    {
        // STKERR: Stacking error during exception entry (bus fault).
        __asm("nop");
    }
    if (cfsr & (1UL << 13))
    {
        // LSPERR: Lazy state preservation error on bus fault.
        __asm("nop");
    }
    if (cfsr & (1UL << 15))
    {
        // BFARVALID is set: The BFAR register holds a valid bus fault address.
        // Check bfar to see the address related to the bus fault.
        (void)bfar;
        __asm("nop");
    }

    // --- Usage Fault Analysis (CFSR bits 16-31) ---
    if (cfsr & (1UL << 16))
    {
        // UNDEFINSTR: An undefined instruction was executed.
        __asm("nop");
    }
    if (cfsr & (1UL << 17))
    {
        // INVSTATE: Invalid state occurred (possibly an invalid EPSR value).
        __asm("nop");
    }
    if (cfsr & (1UL << 18))
    {
        // INVPC: Invalid PC load; may indicate a bad EXC_RETURN value.
        __asm("nop");
    }
    if (cfsr & (1UL << 19))
    {
        // NOCP: Attempted to use a coprocessor that is not present.
        __asm("nop");
    }
    if (cfsr & (1UL << 24))
    {
        // UNALIGNED: Unaligned access error occurred.
        __asm("nop");
    }
    if (cfsr & (1UL << 25))
    {
        // DIVBYZERO: Division by zero error occurred.
        __asm("nop");
    }

    // --- Hard Fault Status Analysis (HFSR) ---
    if (hfsr & (1UL << 1))
    {
        // VECTTBL: Bus fault on vector table read during exception processing.
        __asm("nop");
    }
    if (hfsr & (1UL << 30))
    {
        // FORCED: A configurable fault (memory management, bus, or usage fault) escalated to a hard fault.
        __asm("nop");
    }
    if (hfsr & (1UL << 31))
    {
        // DEBUGEVT: A debug event occurred while the debug subsystem was not enabled.
        __asm("nop");
    }

    /*
     * Reference every snapshot once so none is reported as unused; each
     * variable is volatile, so it stays visible to the debugger at the
     * breakpoint below.
     */
    (void)stacked_r0;
    (void)stacked_r1;
    (void)stacked_r2;
    (void)stacked_r3;
    (void)stacked_r12;
    (void)stacked_lr;
    (void)stacked_pc;
    (void)stacked_psr;
    (void)afsr;

    /* Halt here: inspect stacked_pc / stacked_lr to find the interrupted instruction. */
    __asm("bkpt 1");
}


