Optimizations for FFT DSP Process STM32H730 (CORDIC, ETC)
Hi, I am working on a project doing realtime processing of FFT information on an audio signal on the STM32H730. In order to get better realtime performance I would like to optimize this process as much as possible.
What would you recommend to optimize the following code block? Is there anywhere I can replace math operations with CMSIS or CORDIC functions, and/or just write something to be more computationally efficient?
/*
 * Wrap a phase angle, in radians, back towards zero.
 *
 * Non-negative inputs are offset by +pi, reduced with fmodf by 2*pi and
 * offset back, giving a value in [-pi, pi). Negative inputs use the mirrored
 * form and land in (-pi, pi].
 *
 * All constants are single precision so that no intermediate is promoted to
 * double: the previous form mixed float operands with the double literals
 * 2.0 and M_PI, forcing double arithmetic and float<->double conversions on
 * every call.
 *
 * phase_in: phase in radians, any finite magnitude.
 * returns:  the wrapped phase in radians.
 */
float32_t wrap_phase(float32_t phase_in){
    const float32_t pi = (float32_t)M_PI;
    const float32_t two_pi = 2.0f * pi;

    if (phase_in >= 0.0f){
        return fmodf(phase_in + pi, two_pi) - pi;
    }
    /* fmodf takes the sign of its result from the dividend, so the sign of
     * the divisor is irrelevant; it is kept negative to mirror the branch
     * above. */
    return fmodf(phase_in - pi, -two_pi) + pi;
}
void pv_analyze(void){
for(int i = 0; i < fft_size; ++i){
fft_in[i] = input_ring[(i + index) % fft_size] * window[i];
}
arm_rfft_fast_f32(&fftHandler, fft_in, fft_out, 0);
for(int i = 0; i < fft_size; i+=2){
float32_t amp = sqrtf((fft_out[i] * fft_out[i]) + (fft_out[i+1] * fft_out[i+1]));
float32_t phase = atan2f(fft_out[i+1], fft_out[i]);
float32_t phase_diff = phase - last_phase_in[i]; //no divide for simplicity
float32_t bin_center_freq = M_PI * (float32_t)i / (float32_t)fft_size; //(2*pi*n/fft_size) i = n * 2 for arm fft format
phase_diff = wrap_phase(phase_diff - bin_center_freq * (float32_t)hop_size);
float32_t bin_deviation = phase_diff * (float32_t)fft_size / (float32_t)hop_size / (2.0 * M_PI);
pv_in[i+1] = (float32_t)(i/2) + bin_deviation;
pv_in[i] = amp;
last_phase_in[i] = phase;
}
}
void pv_process(void){
memset(pv_out, 0, fft_size * sizeof(float32_t));
float32_t shift = pitch_shift.get();
for(int i = 0; i < fft_size; i += 2){
int n = i / 2;
int new_bin = int(n * shift + 0.5) * 2;
if(new_bin > 0 && new_bin < fft_size){
if(pv_out[new_bin] < pv_in[i]){
pv_out[new_bin+1] = pv_in[i+1] * shift;
pv_out[new_bin] = pv_in[i];
}
}
}
}
void pv_synthesize(void){
for(int i = 0; i < fft_size; i+=2){
int n = i / 2;
float32_t amp = pv_out[i];
float32_t bin_deviation = pv_out[i+1] - (float32_t)n;
float32_t phase_diff = bin_deviation * 2.0 * M_PI * (float32_t)hop_size / (float32_t)fft_size;
float32_t bin_center_freq = M_PI * (float32_t)i / (float32_t)fft_size;
phase_diff += bin_center_freq * (float32_t)hop_size;
float32_t out_phase = wrap_phase(last_phase_out[i] + phase_diff);
last_phase_out[i] = out_phase;
fft_in[i] = amp * cosf(out_phase);
fft_in[i+1] = amp * sinf(out_phase);
}
arm_rfft_fast_f32(&fftHandler, fft_in, fft_out, 1);
for(int i = 0; i < fft_size; i++){
output_ring[(i+index)%fft_size] += fft_out[i] * window[i] * 0.5;
}
}
Please advise. Thanks!
