Skip to main content
Graduate
January 28, 2021
Question

Why does my Tx-only software 3 MBaud UART sometimes send strange characters?

  • January 28, 2021
  • 11 replies
  • 3543 views

I am working with the STM32F769 microcontroller, which is running the FreeRTOS operating system. It uses the following clock configuration (216 MHz SystemCoreClock):

/** System Clock Configuration
*/
void SystemClock_Config(void)
{
 
 RCC_OscInitTypeDef RCC_OscInitStruct = {0};
 RCC_ClkInitTypeDef RCC_ClkInitStruct = {0};
 RCC_PeriphCLKInitTypeDef PeriphClkInitStruct = {0};
 
 /**Configure the main internal regulator output voltage
 */
 __HAL_RCC_PWR_CLK_ENABLE();
 
 __HAL_PWR_VOLTAGESCALING_CONFIG(PWR_REGULATOR_VOLTAGE_SCALE3);
 
 /**Initializes the CPU, AHB and APB busses clocks
 */
 RCC_OscInitStruct.OscillatorType = RCC_OSCILLATORTYPE_LSI|RCC_OSCILLATORTYPE_HSE;
 RCC_OscInitStruct.HSEState = RCC_HSE_ON;
 RCC_OscInitStruct.LSIState = RCC_LSI_ON;
 RCC_OscInitStruct.PLL.PLLState = RCC_PLL_ON;
 RCC_OscInitStruct.PLL.PLLSource = RCC_PLLSOURCE_HSE;
 RCC_OscInitStruct.PLL.PLLM = 25;
 RCC_OscInitStruct.PLL.PLLN = 432;
 RCC_OscInitStruct.PLL.PLLQ = 9;
 RCC_OscInitStruct.PLL.PLLP = RCC_PLLP_DIV2;
 
 RCC_OscInitStruct.PLL.PLLR = 2; /* Even when DSI is disabled PLLR with 2 <= PLLR <= 7 according to RM0410 p.163 */
 if (HAL_RCC_OscConfig(&RCC_OscInitStruct) != HAL_OK)
 {
 Error_Handler();
 }
 /** Activate the Over-Drive mode
 */
 if (HAL_PWREx_EnableOverDrive() != HAL_OK)
 {
 Error_Handler();
 }
 
 
 /**Initializes the CPU, AHB and APB busses clocks
 */
 RCC_ClkInitStruct.ClockType = RCC_CLOCKTYPE_HCLK|RCC_CLOCKTYPE_SYSCLK
 |RCC_CLOCKTYPE_PCLK1|RCC_CLOCKTYPE_PCLK2;
 RCC_ClkInitStruct.SYSCLKSource = RCC_SYSCLKSOURCE_PLLCLK;
 RCC_ClkInitStruct.AHBCLKDivider = RCC_SYSCLK_DIV1;
 RCC_ClkInitStruct.APB1CLKDivider = RCC_HCLK_DIV4;
 RCC_ClkInitStruct.APB2CLKDivider = RCC_HCLK_DIV2;
 
 if (HAL_RCC_ClockConfig(&RCC_ClkInitStruct, FLASH_LATENCY_7) != HAL_OK)
 {
 Error_Handler();
 }

I have written a software transmit-only UART running at 3 MBaud on any arbitrary GPIO pin. For the most part it works great, but every once in a while, especially in interrupts, it starts "writing in Chinese" and strange characters appear in my terminal window. Does anybody know what's wrong?

/* Logic level of the TX line while no frame is being sent. */
#define IDLE_STATE 1

/*
 * Saves the current PRIMASK value in a new local variable `old_primask` and
 * then disables interrupts (unconditionally, despite the name).
 *
 * The expansion declares a variable, so it cannot be wrapped in
 * do { } while (0) and must be used as a full statement, at most once per
 * scope, in the same scope as ENABLE_ALL_INTS_IF_THEY_WERE_ENABLED().
 */
#define DISABLE_ALL_INTS_IF_NECESSARY() uint32_t old_primask = __get_PRIMASK(); \
 __disable_irq()

/*
 * Re-enables interrupts only when the PRIMASK value saved by
 * DISABLE_ALL_INTS_IF_NECESSARY() was zero, so a caller that already had
 * interrupts masked keeps them masked.
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement,
 * including when it is the body of an if/else.
 */
#define ENABLE_ALL_INTS_IF_THEY_WERE_ENABLED() do { \
 if (!old_primask) \
 { \
 __enable_irq(); \
 } \
 } while (0)
/* GPIO port and pin mask used as the software UART TX line.
 * Both are assigned by LL_UART_SW_TxOnly_enable(). */
static GPIO_TypeDef *TxPort_;
static uint16_t TxPin_;
 
/**
 * Selects the GPIO pin used as the software UART TX line and configures it
 * as a push-pull output resting at IDLE_STATE.
 *
 * @param TxPort GPIO port of the TX pin.
 * @param TxPin  Pin mask within that port (it is later written to BSRR as-is).
 *
 * NOTE(review): the GPIO port clock is not enabled here; presumably the
 * caller has already done that -- confirm.
 */
void LL_UART_SW_TxOnly_enable(GPIO_TypeDef* TxPort, uint16_t TxPin) {
 TxPort_ = TxPort;
 TxPin_ = TxPin;

 /* Latch the idle level BEFORE the pin becomes an output. Otherwise the pin
  * drives whatever the output data register held until the write after
  * HAL_GPIO_Init(), which a receiver can see as a spurious start bit.
  * (Assumes SET_PIN, defined elsewhere, writes the port's output latch.) */
 SET_PIN(TxPort_, TxPin_, IDLE_STATE);

 GPIO_InitTypeDef initStruct = {
  .Pin = TxPin_,
  .Mode = GPIO_MODE_OUTPUT_PP,
  .Pull = GPIO_NOPULL,
  .Speed = GPIO_SPEED_LOW,
  .Alternate = 0,
 };
 HAL_GPIO_Init(TxPort_, &initStruct);

 /* Kept from the original sequence: make sure the line idles after init. */
 SET_PIN(TxPort_, TxPin_, IDLE_STATE);
}
 
void LL_UART_SW_TxOnly_disable(void) {
 GPIO_InitTypeDef initStruct = {TxPin_, GPIO_MODE_ANALOG, GPIO_NOPULL, GPIO_SPEED_LOW, 0};
 HAL_GPIO_Init(TxPort_, &initStruct);
}
 
 /* 216 MHz system clock */
 /*
  * OUTPUT_BIT(n): writes BSRRvalues[n] to *BSRR fourteen times in a row.
  *
  * Expects `BSRR` and `BSRRvalues` to be in scope at the expansion site
  * (they are the parameters of outputByte()).
  *
  * Every store writes the same value, so the pin level does not change after
  * the first one; the remaining stores presumably only pad the bit period
  * (216 MHz / 3 MBaud = 72 core cycles per bit). NOTE(review): the count of 14
  * looks tuned for the unoptimized (O0) code of outputByte() -- verify on a
  * scope after any compiler, memory-placement or clock change.
  *
  * The expansion is a plain statement sequence that already ends in ';' and
  * is not wrapped in do { } while (0), so it must not be used as the
  * unbraced body of an if/else or loop.
  */
#define OUTPUT_BIT(__BIT_NO__) *BSRR = BSRRvalues[__BIT_NO__]; \
 *BSRR = BSRRvalues[__BIT_NO__]; \
 *BSRR = BSRRvalues[__BIT_NO__]; \
 *BSRR = BSRRvalues[__BIT_NO__]; \
 *BSRR = BSRRvalues[__BIT_NO__]; \
 *BSRR = BSRRvalues[__BIT_NO__]; \
 *BSRR = BSRRvalues[__BIT_NO__]; \
 *BSRR = BSRRvalues[__BIT_NO__]; \
 *BSRR = BSRRvalues[__BIT_NO__]; \
 *BSRR = BSRRvalues[__BIT_NO__]; \
 *BSRR = BSRRvalues[__BIT_NO__]; \
 *BSRR = BSRRvalues[__BIT_NO__]; \
 *BSRR = BSRRvalues[__BIT_NO__]; \
 *BSRR = BSRRvalues[__BIT_NO__];
 
/**
 * Shifts one complete frame out on the TX pin by writing ten precomputed
 * values to the GPIO set/reset register.
 *
 * @param BSRR       Address of the port's BSRR register.
 * @param BSRRvalues Ten register values: [0] start bit, [1]..[8] data bits
 *                   in transmission order, [9] stop bit.
 *
 * Entries 0..8 are each emitted through OUTPUT_BIT (repeated stores that
 * stretch the bit period); the stop bit is written only once, so its length
 * is determined by whatever the caller does before the next frame starts.
 *
 * NOTE(review): the function is forced to O0, presumably so that the code
 * generated for each store -- and therefore the bit timing -- does not depend
 * on the project's optimization settings.
 */
inline static void __attribute__((optimize("O0"))) outputByte(volatile uint32_t* BSRR, uint32_t BSRRvalues[10]) {
 OUTPUT_BIT(0);
 OUTPUT_BIT(1);
 OUTPUT_BIT(2);
 OUTPUT_BIT(3);
 OUTPUT_BIT(4);
 OUTPUT_BIT(5);
 OUTPUT_BIT(6);
 OUTPUT_BIT(7);
 OUTPUT_BIT(8);
 /* Stop bit: a single store, no padding. */
 *BSRR = BSRRvalues[9];
}
 
// Superfast (3 MBit/s) UART
/**
 * Sends one byte as a start bit, eight data bits (bit 0 first) and a stop
 * bit on the pin selected by LL_UART_SW_TxOnly_enable().
 *
 * @param byteToSend Byte to transmit.
 *
 * All ten BSRR values are computed before interrupts are disabled, so the
 * interrupts-off window only covers the actual bit output in outputByte().
 * The PRIMASK state found on entry is restored afterwards, so the function
 * leaves interrupts masked if the caller already had them masked.
 *
 * NOTE(review): outputByte() writes the stop bit only once and returns. The
 * stop-bit duration is therefore the time until the next call reaches its
 * start bit (return, interrupt re-enable, call, table setup). If that is
 * shorter than one bit time, or an interrupt stretches a byte boundary,
 * receivers may mis-frame -- worth measuring.
 */
void __attribute__((optimize("O3"))) LL_UART_SW_TxOnly_transmitByte(uint8_t byteToSend) {
 uint32_t BSRRvalues[10];
 /* Entry 0 drives the line to the level opposite IDLE_STATE, entry 9 drives
  * it to IDLE_STATE. Per the STM32 BSRR layout, the pin mask in the upper
  * half-word resets the pin and in the lower half-word sets it. */
 BSRRvalues[0] = (IDLE_STATE == 1) ? (((uint32_t)TxPin_) << 16) : TxPin_; // Start bit
 BSRRvalues[9] = (IDLE_STATE == 0) ? (((uint32_t)TxPin_) << 16) : TxPin_; // Stop bit
 /* Each data bit reuses the stop-bit value for a 1 and the start-bit value
  * for a 0. */
 BSRRvalues[1] = BSRRvalues[(byteToSend & 0x01) ? 9 : 0]; // Bit 0
 BSRRvalues[2] = BSRRvalues[(byteToSend & 0x02) ? 9 : 0]; // Bit 1
 BSRRvalues[3] = BSRRvalues[(byteToSend & 0x04) ? 9 : 0]; // Bit 2
 BSRRvalues[4] = BSRRvalues[(byteToSend & 0x08) ? 9 : 0]; // Bit 3
 BSRRvalues[5] = BSRRvalues[(byteToSend & 0x10) ? 9 : 0]; // Bit 4
 BSRRvalues[6] = BSRRvalues[(byteToSend & 0x20) ? 9 : 0]; // Bit 5
 BSRRvalues[7] = BSRRvalues[(byteToSend & 0x40) ? 9 : 0]; // Bit 6
 BSRRvalues[8] = BSRRvalues[(byteToSend & 0x80) ? 9 : 0]; // Bit 7
 /* Declares the local `old_primask` and masks interrupts. */
 DISABLE_ALL_INTS_IF_NECESSARY();
 outputByte(&TxPort_->BSRR, BSRRvalues);
 /* Unmasks interrupts only if they were unmasked on entry. */
 ENABLE_ALL_INTS_IF_THEY_WERE_ENABLED();
}
 
 
/**
 * Transmits numBytes bytes starting at pData, one frame per byte, in order.
 *
 * @param pData    First byte to send.
 * @param numBytes Number of bytes to send; nothing is sent when it is 0.
 */
void LL_UART_SW_TxOnly_transmitBuffer(uint8_t* pData, uint16_t numBytes) {
 for (uint16_t sent = 0; sent < numBytes; ++sent) {
  LL_UART_SW_TxOnly_transmitByte(pData[sent]);
 }
}
 
/**
 * Transmits the characters of a NUL-terminated string in order.
 * The terminating NUL itself is not sent.
 *
 * @param pData String to send.
 */
void LL_UART_SW_TxOnly_transmitNullTermString(const char* pData) {
 for (const char* cursor = pData; *cursor != 0; ++cursor) {
  LL_UART_SW_TxOnly_transmitByte(*cursor);
 }
}

    This topic has been closed for replies.

    11 replies

    Super User
    January 28, 2021

    This is not your friendly 8-bitter with predictable instruction timing.

    I'd start with rewriting this into inline/separate asm, then I'd continue with investigating execution jitter when running it from different memories (perhaps TCM RAM would be the best candidate?)

    I'd avoid the multiple writes to the register, it may unnecessarily collide with other busmasters (DMA).

    And I'd never, never use this in anything else but a debug printout.

    JW

    arnold_wAuthor
    Graduate
    January 28, 2021

    Yes, this is for debugging (I'm not a big fan of trace). The code works great in my STM32F405 and STM32F446 projects, that don't have any RTOS, even when called from interrupt handlers. Are there that big differences between the STM32F769 and the STM32F4 families that the microcontroller family (and thus, architecture) makes all the difference? Can't the RTOS create these problems? I'm calling __disable_irq() before outputting each byte, but I don't know if the RTOS can still perform context switching?

    Super User
    January 28, 2021

    > I'm calling __disable_irq() before outputting each byte,

    > but I don't know if the RTOS can still perform context switching?

    No. RTOS is no magic, it simply uses a timer interrupt.

    The Cortex-M7, as compared to the Cortex-M4, is superscalar and features speculative elements like branch prediction, which increase execution jitter and decrease the user's control over timing. Plus, the bus fabric is way more complex, adding further to the uncertainties.

    JW

    Super User
    January 28, 2021

    > I'm calling __disable_irq() before outputting each byte,

    > but I don't know if the RTOS can still perform context switching?

    No. RTOS is no magic, it simply uses a timer interrupt.

    The Cortex-M7, as compared to the Cortex-M4, is superscalar and features speculative elements like branch prediction, which increase execution jitter and decrease the user's control over timing. Plus, the bus fabric is way more complex, adding further to the uncertainties.

    JW

    Graduate II
    January 29, 2021

    For 216 MHz the PWR regulator voltage scaling must be set to scale 1 or not changed at all, because scale 1 is the reset value.

    arnold_wAuthor
    Graduate
    January 29, 2021

    By adding the word __RAM_FUNC to the outputByte and LL_UART_SW_TxOnly_transmitByte functions, I was able to make it a lot more reliable:

    inline static void __attribute__((optimize("O0"))) __RAM_FUNC outputByte(volatile uint32_t* BSRR, uint32_t BSRRvalues[10]) {
     .
    .
    .
    }
     
    void __attribute__((optimize("O3"))) __RAM_FUNC LL_UART_SW_TxOnly_transmitByte(uint8_t byteToSend) {
    .
    .
    .
    }

    However, when I tried to replace

    #define OUTPUT_BIT_(__BIT_NO__) *BSRR = BSRRvalues[__BIT_NO__]; \
     *BSRR = BSRRvalues[__BIT_NO__]; \
     *BSRR = BSRRvalues[__BIT_NO__]; \
     .
     .
     .

    with

    #define OUTPUT_BIT(__BIT_NO__) *BSRR = BSRRvalues[__BIT_NO__]; \
     asm("NOP"); \
     asm("NOP"); \
     asm("NOP"); \
     .
     .
     .

    then, to my surprise, it became unreliable again. I really couldn't make any sense anymore of what I saw on the oscilloscope when I measured the bit periods. Does anybody know why and does anybody know what I can replace the dangerous *BSRR = BSRRvalues[__BIT_NO__] assignments with instead?

    Super User
    January 29, 2021

    Check the generated assembly code. What does it look like?

    arnold_wAuthor
    Graduate
    January 29, 2021

    > I'd avoid the multiple writes to the register, it may unnecessarily collide with other busmasters (DMA).

    > But BSRR trickles through the bus matrix with several clocks, while nop has no interaction with the bus matrix.

    I assume polling a timer interrupt flag would create just as much traffic on bus/matrix? But what if I would poll the DWT Cycle Counter, would that be a better choice?

    Graduate II
    January 29, 2021

    Both SysTick and DWT counters are local to CPU core and could be used with appropriate code. Though DWT could be easier as it's 32-bit. Still GPIO on AHB with polling some timer on APB also should be good enough.

    Take a look at SEGGER RTT, J-Scope and SystemView. ;)

    https://www.segger.com/products/debug-probes/j-link/technology/about-real-time-transfer/

    https://www.segger.com/products/debug-probes/j-link/tools/j-scope/

    https://www.segger.com/products/development-tools/systemview/

    Super User
    January 29, 2021

    DSB is often used as a time-wasting instruction instead of NOP.

    What's wrong with using hardware UARTs? Yes, there's limitation on which pins you can use them, and yes, there are only 8 of them in the 'F769; but OTOH they tend to work properly all the time.

    JW

    arnold_wAuthor
    Graduate
    January 30, 2021

    > What's wrong with using hardware UARTs?

    We have 10 microcontrollers on the PCB; all are ball grid arrays and all of them have hardware UARTs. On 9 of them we have filters that degrade the signals at high frequencies, so we can't go any faster than 115200. The 10th microcontroller has trace implemented; I tried it once and would never touch that again, not even with a pole. So, if I want fast logging (high-speed buses are difficult to debug with a 115200 UART) without making another spin of the PCB, the only option left is to try to find nets that appear on pads/vias and can double as a UART line, and then solder wires onto them.

    Graduate II
    February 1, 2021

    Have a look at orbuculum https://github.com/orbcode/orbuculum and the help you can get from Mubes on the discord orbuculum channel before you dump SWO/Trace.

    Super User
    January 29, 2021

    If it's TX only, you could try 10-bit SPI transfers and ignore the clock pin.

    Super User
    January 30, 2021

    Or, if you have 2 lines available, you can try to bitbang SPI.

    Btw on a development board, I'd simply remove the filters.

    JW

    Graduate
    January 31, 2021

    I sure wouldn't want to bit-bang at 3-megabaud !

    arnold_wAuthor
    Graduate
    January 31, 2021

    > I sure wouldn't want to bit-bang at 3-megabaud !

    Next week I'm planning to order 12 MBaud cables ( https://ftdichip.com/products/c232hd-ddhsp-0 ). Btw, does anybody know if there are even faster cables available on the market?