I first finished the SPI RX DMA setup. I simply changed a few settings, and actually included the arming of both streams.

// Configure DMA1 Stream3 (SPI2->DR -> cmd_data_buf) and DMA1 Stream4
// (erpm_data_buf -> SPI2->DR) for 11-byte, 8-bit, direct-mode transfers,
// then enable both streams.
// NOTE(review): no DMA event flags (LIFCR/HIFCR) are cleared in this fragment
// before EN is set -- confirm against the RM0090 stream configuration
// procedure whether that is required here.

// Reset stream 3 and wait until the hardware reports it disabled
  DMA1_Stream3->CR = 0;
  while (DMA1_Stream3->CR & DMA_SxCR_EN);

  // Set source peripheral pointer (SPI2 data register)
  DMA1_Stream3->PAR = (uint32_t)&SPI2->DR;
  // Set destination buffer pointer
  DMA1_Stream3->M0AR = (uint32_t)cmd_data_buf;
  // Transfer 11 bytes
  DMA1_Stream3->NDTR = 11;
  // Enable direct mode (no FIFO)
  DMA1_Stream3->FCR = 0;

  DMA1_Stream3->CR =
      DMA_SxCR_TCIE               // Enable transfer complete interrupt
      | (0 << DMA_SxCR_DIR_Pos)   // Peripheral to Memory
      | DMA_SxCR_MINC             // Enable memory increment mode
      | (0 << DMA_SxCR_PSIZE_Pos) // Set 8-bit peripheral data size
      | (0 << DMA_SxCR_MSIZE_Pos) // Set 8-bit memory data size
      // Set priority level to medium. Doesn't actually matter since TIMs and
      // IDR are on DMA2
      | (1 << DMA_SxCR_PL_Pos) |
      (0 << DMA_SxCR_CHSEL_Pos); // Set channel 0 [RM0090 Table 43]

  // Reset stream 4 and wait until the hardware reports it disabled
  DMA1_Stream4->CR = 0;
  while (DMA1_Stream4->CR & DMA_SxCR_EN);

  // Set destination peripheral pointer (this stream is memory-to-peripheral,
  // so the SPI2 data register is written, not read)
  DMA1_Stream4->PAR = (uint32_t)&SPI2->DR;
  // Set source buffer pointer
  DMA1_Stream4->M0AR = (uint32_t)erpm_data_buf;
  // Transfer 11 bytes
  DMA1_Stream4->NDTR = 11;
  // Enable direct mode (no FIFO)
  DMA1_Stream4->FCR = 0;

  // No interrupt is enabled on this stream; only stream 3 sets TCIE.
  DMA1_Stream4->CR =
      (1 << DMA_SxCR_DIR_Pos)     // Memory to Peripheral
      | DMA_SxCR_MINC             // Enable memory increment mode
      | (0 << DMA_SxCR_PSIZE_Pos) // Set 8-bit peripheral data size
      | (0 << DMA_SxCR_MSIZE_Pos) // Set 8-bit memory data size
      // Set priority level to medium. Doesn't actually
      // matter since TIMs and IDR are on DMA2
      | (1 << DMA_SxCR_PL_Pos) |
      (0 << DMA_SxCR_CHSEL_Pos); // Set channel 0 [RM0090 Table 43]

  // Arm both streams
  DMA1_Stream3->CR |= DMA_SxCR_EN;
  DMA1_Stream4->CR |= DMA_SxCR_EN;
}

After examining the protocol for re-syncing after a CRC failure and looking at methods for frame synchronization, I decided to change to hardware NSS, which required some changes to the SPI and pin setup. I also finished the CRC setup in the SPI configuration, selecting the 0x2F polynomial to allow for triple-bit error detection.

// Configure SPI2 as a full-duplex, MSB-first, mode-0 slave with hardware NSS,
// 8-bit hardware CRC (polynomial 0x2F) and RX/TX DMA requests, then route
// PB12..PB15 to the SPI2 alternate function (AF5) at medium output speed.
// NOTE(review): the GPIOB peripheral clock is not enabled in this fragment and
// SPE is set before the pins are switched to AF mode -- confirm both are
// intended / handled elsewhere.

// Enable SPI2 Clock
  RCC->APB1ENR |= RCC_APB1ENR_SPI2EN;
  // Read the register back and discard the value (presumably to make sure the
  // clock enable has taken effect before SPI2 is touched -- TODO confirm).
  (void)RCC->APB1ENR;

  SPI2->CR1 =
      // Set data capture on first capture edge
      (0 << SPI_CR1_CPHA_Pos)
      // Set idle low
      | (0 << SPI_CR1_CPOL_Pos)
      // Set slave mode
      | (0 << SPI_CR1_MSTR_Pos)
      // Set MSB first
      | (0 << SPI_CR1_LSBFIRST_Pos)
      // Enable Hardware SS
      | (0 << SPI_CR1_SSM_Pos)
      // Set full duplex
      | (0 << SPI_CR1_RXONLY_Pos);

  // Set CRC polynomial to x^8 + x^5 + x^3 + x^2 + x + 1
  // The x^8 term is implicit, hence 0x2F (0b0010'1111)
  // This ensures detection of 1, 2, and 3 bit errors
  SPI2->CRCPR = 0x2F;
  // Enable CRC (must be done while SPE is still 0)
  SPI2->CR1 |= SPI_CR1_CRCEN;

  // Enable RX and TX DMA requests when RXNE and TXE flags are set
  SPI2->CR2 = SPI_CR2_RXDMAEN | SPI_CR2_TXDMAEN;

  // Enable SPI peripheral
  SPI2->CR1 |= SPI_CR1_SPE;

  // Set SPI pins to alternate function mode
  // [RM0090 8.3.7 & Figure 26, DS8626 Table 7]
  GPIOB->MODER &= ~(GPIO_MODER_MODE12_Msk | GPIO_MODER_MODE13_Msk |
                    GPIO_MODER_MODE14_Msk | GPIO_MODER_MODE15_Msk);
  GPIOB->MODER |= (2 << GPIO_MODER_MODE12_Pos) | (2 << GPIO_MODER_MODE13_Pos) |
                  (2 << GPIO_MODER_MODE14_Pos) | (2 << GPIO_MODER_MODE15_Pos);

  // Set SPI pins to correct alternate function (AF5)
  // [RM0090 8.3.7 & Figure 26, DS8626 Table 7]
  // All four 4-bit AFSEL fields must be cleared with the AFRH masks; a MODER
  // mask here would leave PB12's field untouched before the OR below.
  GPIOB->AFR[1] &= ~(GPIO_AFRH_AFSEL12_Msk | GPIO_AFRH_AFSEL13_Msk |
                     GPIO_AFRH_AFSEL14_Msk | GPIO_AFRH_AFSEL15_Msk);
  GPIOB->AFR[1] |= (5 << GPIO_AFRH_AFSEL12_Pos) | (5 << GPIO_AFRH_AFSEL13_Pos) |
                   (5 << GPIO_AFRH_AFSEL14_Pos) | (5 << GPIO_AFRH_AFSEL15_Pos);

  // Set SPI pins OSPEEDR value to medium [DS8626 Table 50]
  GPIOB->OSPEEDR &= ~(GPIO_OSPEEDR_OSPEED12_Msk | GPIO_OSPEEDR_OSPEED13_Msk |
                      GPIO_OSPEEDR_OSPEED14_Msk | GPIO_OSPEEDR_OSPEED15_Msk);
  GPIOB->OSPEEDR |=
      (1 << GPIO_OSPEEDR_OSPEED12_Pos) | (1 << GPIO_OSPEEDR_OSPEED13_Pos) |
      (1 << GPIO_OSPEEDR_OSPEED14_Pos) | (1 << GPIO_OSPEEDR_OSPEED15_Pos);

I also completed the RX DMA Transfer Complete Interrupt, which is responsible for checking the hardware CRC calculation for errors, clearing the CRC register, resetting the CRC and SPI peripherals after each transfer, and performing the calculations necessary to transform the raw DShot command/throttle data into DShot frames (including their CRC) and then into the correct CCR values. This one took a decent bit of staring at the datasheet, since the interrupt triggers after the last data byte is transferred by DMA, not after the CRC byte is received, so we have to acknowledge (clear) the interrupt, wait for the CRC byte, flush it from the data register, check for errors, and only then begin processing. After processing we switch the CCR buffer front value, so the timers begin to read from that at the next cycle.

We also need to reset the CRC, which according to the datasheet should only be done after the NSS line is pulled high, so we spin-wait for this after the processing.

Lastly we re-configure and re-enable the DMA.

// Eight 16-bit per-motor words, also addressable as 16 raw bytes.
typedef union {
  uint16_t words[8];
  uint8_t bytes[16];
} motor_frame_t;

// Unpacked command words; the RX-complete ISR unpacks cmd_data_buf into
// cmd_dshot_buf.words.
static volatile motor_frame_t cmd_dshot_buf;
// Counterpart for the eRPM direction; not referenced by the code shown here.
static volatile motor_frame_t erpm_dshot_buf;

// Packed SPI payloads: 8 motors x 11 bits = 88 bits = 11 bytes.
// cmd_data_buf is the memory target of DMA1 Stream3 (SPI2 RX);
// erpm_data_buf is the memory source of DMA1 Stream4 (SPI2 TX).
static volatile uint8_t cmd_data_buf[11];
static volatile uint8_t erpm_data_buf[11];

// CCR value tables, indexed [buffer][frame bit 0..15][channel 0..3].
// The first index is a double buffer: the ISR fills [cmd_ccr_front ^ 1] and
// then toggles cmd_ccr_front.
static volatile uint32_t cmd_ccr_tim1_buf[2][16][4];
static volatile uint32_t cmd_ccr_tim8_buf[2][16][4];
// Index (0 or 1) of the currently active ("front") CCR buffer.
static volatile uint8_t cmd_ccr_front;

void DMA1_Stream3_IRQHandler(void) {
  // Clear transfer complete interrupt flag
  DMA1->LIFCR = DMA_LIFCR_CTCIF3;

  // Wait for CRC byte to transfer
  while (!(SPI2->SR & SPI_SR_RXNE));
  (void)SPI2->DR; // Flush CRC

  bool crc_err = SPI2->SR & SPI_SR_CRCERR;
  if (crc_err) {
    SPI2->SR &= ~SPI_SR_CRCERR;
  } else {
    volatile uint16_t *words = cmd_dshot_buf.words;

    unpack_11bit(cmd_data_buf, words);

    build_dshot_frames(words, cmd_ccr_tim1_buf[cmd_ccr_front ^ 1],
                       cmd_ccr_tim8_buf[cmd_ccr_front ^ 1]);

    // switch the double buffer
    cmd_ccr_front ^= 1;

    // Wait for NSS = high
    while (SPI2->SR & SPI_SR_BSY);

    // Now we can clear CRC [RM0090 28.3.6]
    SPI2->CR1 &= ~SPI_CR1_SPE;
    SPI2->CR1 &= ~SPI_CR1_CRCEN;
    SPI2->CR1 |= SPI_CR1_CRCEN;
  }

  // Reconfigure + Re-enable the DMA
  DMA1_Stream3->NDTR = 11;
  DMA1_Stream4->NDTR = 11;

  DMA1_Stream3->CR |= DMA_SxCR_EN;
  DMA1_Stream4->CR |= DMA_SxCR_EN;

  // Re-enable the SPI peripheral (does nothing if crc_err = true)
  SPI2->CR1 |= SPI_CR1_SPE;
}

unpack_11bit() simply unpacks the 8 contiguous (bit-packed) 11-bit values into 8 unsigned 16-bit values.

build_dshot_frames() is a bit more complicated:

#define DSHOT_BIT1_CCR 210 // 280 * 0.75 = 210
#define DSHOT_BIT0_CCR 105 // 280 * 0.375 = 105

// Checksum nibble for a 12-bit DShot packet (11 data bits followed by the
// telemetry bit): XOR of the packet's three nibbles, bit-inverted, low 4 bits.
// NOTE(review): the inversion corresponds to the bidirectional-DShot checksum
// variant; plain DShot uses the non-inverted XOR -- confirm against the ESCs.
static inline uint8_t dshot_crc(uint16_t data) {
  return (~(data ^ (data >> 4) ^ (data >> 8))) & 0x0F;
}

// CCR compare value that encodes bit number `bit` of `value`.
static inline uint32_t dshot_bit_ccr(uint16_t value, uint8_t bit) {
  return ((value >> bit) & 1u) ? DSHOT_BIT1_CCR : DSHOT_BIT0_CCR;
}

// Write the 16 CCR values of one DShot frame into column `ch` of `buf`.
static void dshot_fill_channel(uint32_t buf[16][4], uint8_t ch, uint16_t word) {
  // Only the low 11 bits are transmitted, so only they may enter the checksum.
  const uint16_t data = word & 0x07FF;

  // The checksum covers the 12-bit packet (data << 1) | telemetry, with the
  // telemetry bit fixed at 0. Shifting further left would push the top nibble
  // out of the three nibbles dshot_crc() folds together.
  const uint8_t crc = dshot_crc((uint16_t)(data << 1));

  // Data bits 10..0, MSB first -> buffer rows 0..10
  for (uint8_t i = 0; i < 11; ++i) {
    buf[i][ch] = dshot_bit_ccr(data, (uint8_t)(10 - i));
  }

  // Telemetry request bit (always 0) -> buffer row 11
  buf[11][ch] = DSHOT_BIT0_CCR;

  // Checksum bits 3..0, MSB first -> buffer rows 12..15
  for (uint8_t i = 0; i < 4; ++i) {
    buf[12 + i][ch] = dshot_bit_ccr(crc, (uint8_t)(3 - i));
  }
}

// Translate eight 11-bit motor words into per-bit timer compare values.
//
// words[0..3] go to channels 0..3 of tim1_buf and words[4..7] to channels
// 0..3 of tim8_buf. For each channel, rows 0..15 hold the CCR value for frame
// bits in transmission order: 11 data bits (MSB first), the telemetry bit
// (always 0), and the 4-bit checksum (MSB first).
void build_dshot_frames(
  const uint16_t words[8], 
  uint32_t tim1_buf[16][4],
  uint32_t tim8_buf[16][4]
) {
  for (uint8_t ch = 0; ch < 4; ++ch) {
    dshot_fill_channel(tim1_buf, ch, words[ch]);
    dshot_fill_channel(tim8_buf, ch, words[ch + 4]);
  }
}

The function, for each channel, grabs its associated word and calculates its corresponding CRC value. It then constructs the CCR register buffer by reversing the bit order to MSB-first and translating each bit from binary to its CCR register value.

I wish I had made more progress this week, but I also had very limited working hours, and so I’m left with what is presented here. As seems to be a common theme for this project, the actual ‘code’ itself takes very little time to write (I wrote the build_dshot_frames() function in less than 10 minutes), but ensuring that every reset and register access is in place is much more time consuming.