miniDSP
A small C library for audio DSP
Loading...
Searching...
No Matches
minidsp_vad.c
Go to the documentation of this file.
1
5
6#include "minidsp.h"
7#include "minidsp_internal.h"
8
9/* -----------------------------------------------------------------------
10 * Feature extraction helpers (static)
11 * -----------------------------------------------------------------------*/
12
17static double compute_spectral_entropy(const double *psd, unsigned num_bins)
18{
19 double total = 0.0;
20 for (unsigned i = 0; i < num_bins; i++)
21 total += psd[i];
22
23 if (total <= 0.0)
24 return 1.0; /* no energy = maximally unstructured */
25
26 double entropy = 0.0;
27 for (unsigned i = 0; i < num_bins; i++) {
28 double p = psd[i] / total;
29 if (p > 0.0)
30 entropy -= p * log(p);
31 }
32
33 double max_entropy = log((double)num_bins);
34 if (max_entropy <= 0.0)
35 return 0.0;
36
37 return entropy / max_entropy;
38}
39
44static double compute_spectral_flatness(const double *psd, unsigned num_bins)
45{
46 double log_sum = 0.0;
47 double arith_sum = 0.0;
48
49 for (unsigned i = 0; i < num_bins; i++) {
50 double val = psd[i] > 0.0 ? psd[i] : 1e-30;
51 log_sum += log(val);
52 arith_sum += psd[i];
53 }
54
55 double arith_mean = arith_sum / (double)num_bins;
56 if (arith_mean <= 0.0)
57 return 1.0; /* no energy = maximally flat */
58
59 double log_geo_mean = log_sum / (double)num_bins;
60 double geo_mean = exp(log_geo_mean);
61
62 double flatness = geo_mean / arith_mean;
63 if (flatness > 1.0) flatness = 1.0;
64 if (flatness < 0.0) flatness = 0.0;
65
66 return flatness;
67}
68
/**
 * Fraction of the total PSD that lies inside a frequency band.
 *
 * Bin k is assigned the frequency k * sample_rate / N and counts toward the
 * band when that frequency is within [band_low_hz, band_high_hz], both ends
 * inclusive.
 *
 * @param psd           Power spectrum values; num_bins entries are read.
 * @param num_bins      Number of entries in psd.
 * @param sample_rate   Sample rate used to map bins to frequencies.
 * @param N             Divisor of sample_rate giving the bin spacing.
 * @param band_low_hz   Lower band edge (inclusive).
 * @param band_high_hz  Upper band edge (inclusive).
 * @return In-band sum divided by the total sum, or 0.0 when the total sum
 *         is not positive.
 */
static double compute_band_energy_ratio(const double *psd, unsigned num_bins,
                                        double sample_rate, unsigned N,
                                        double band_low_hz, double band_high_hz)
{
    const double hz_per_bin = sample_rate / (double)N;
    double all_power = 0.0;
    double in_band_power = 0.0;

    unsigned k = 0;
    while (k < num_bins) {
        const double bin_hz = k * hz_per_bin;

        all_power += psd[k];
        if (bin_hz >= band_low_hz && bin_hz <= band_high_hz) {
            in_band_power += psd[k];
        }
        k++;
    }

    /* Avoid dividing by a non-positive total. */
    return (all_power <= 0.0) ? 0.0 : in_band_power / all_power;
}
93
94/* -----------------------------------------------------------------------
95 * Adaptive normalization helpers (static)
96 * -----------------------------------------------------------------------*/
97
98#define RANGE_FLOOR 1e-12
99
100static void update_normalization(MD_vad_state *state, const double *raw)
101{
102 double alpha = state->params.adaptation_rate;
103
104 for (int i = 0; i < MD_VAD_NUM_FEATURES; i++) {
105 if (state->frames_processed == 0) {
106 state->feat_min[i] = raw[i];
107 state->feat_max[i] = raw[i];
108 } else {
109 if (raw[i] < state->feat_min[i])
110 state->feat_min[i] = state->feat_min[i]
111 + alpha * (raw[i] - state->feat_min[i]);
112 if (raw[i] > state->feat_max[i])
113 state->feat_max[i] = state->feat_max[i]
114 + alpha * (raw[i] - state->feat_max[i]);
115 }
116 }
117}
118
119static void normalize_features(const MD_vad_state *state,
120 const double *raw, double *norm_out)
121{
122 for (int i = 0; i < MD_VAD_NUM_FEATURES; i++) {
123 double range = state->feat_max[i] - state->feat_min[i];
124 if (range < RANGE_FLOOR)
125 range = RANGE_FLOOR;
126
127 double val = (raw[i] - state->feat_min[i]) / range;
128 if (val < 0.0) val = 0.0;
129 if (val > 1.0) val = 1.0;
130 norm_out[i] = val;
131 }
132}
133
134/* -----------------------------------------------------------------------
135 * Extract all five raw features from one frame
136 * -----------------------------------------------------------------------*/
137
138static void extract_features(const double *signal, unsigned N,
139 double sample_rate,
140 double band_low_hz, double band_high_hz,
141 double *raw_out)
142{
143 raw_out[MD_VAD_FEAT_ENERGY] = MD_energy(signal, N);
144 raw_out[MD_VAD_FEAT_ZCR] = MD_zero_crossing_rate(signal, N);
145
146 unsigned num_bins = N / 2 + 1;
147 double psd[num_bins];
148 MD_power_spectral_density(signal, N, psd);
149
150 /* Invert entropy and flatness so that higher = more speech-like.
151 * Both are naturally high for noise (uniform/flat spectrum) and
152 * low for speech (structured harmonics). */
154 1.0 - compute_spectral_entropy(psd, num_bins);
156 1.0 - compute_spectral_flatness(psd, num_bins);
158 compute_band_energy_ratio(psd, num_bins, sample_rate, N,
159 band_low_hz, band_high_hz);
160}
161
162/* -----------------------------------------------------------------------
163 * Public API
164 * -----------------------------------------------------------------------*/
165
167{
168 MD_CHECK_VOID(params != NULL, MD_ERR_NULL_POINTER, "params is NULL");
169
170 /* Optimized VAD parameters (F2-optimized, recall-biased).
171 * Source: 300-trial Optuna search on LibriVAD train-clean-100,
172 * all noise types, all SNRs. F2 improved from 0.837 to 0.933. */
173 params->weights[MD_VAD_FEAT_ENERGY] = 0.723068;
174 params->weights[MD_VAD_FEAT_ZCR] = 0.063948;
175 params->weights[MD_VAD_FEAT_SPECTRAL_ENTROPY] = 0.005964;
176 params->weights[MD_VAD_FEAT_SPECTRAL_FLATNESS] = 0.048865;
177 params->weights[MD_VAD_FEAT_BAND_ENERGY_RATIO] = 0.158156;
178
179 params->threshold = 0.245332;
180 params->onset_frames = 1;
181 params->hangover_frames = 22;
182 params->adaptation_rate = 0.012755;
183 params->band_low_hz = 126.4;
184 params->band_high_hz = 2899.3;
185}
186
187void MD_vad_init(MD_vad_state *state, const MD_vad_params *params)
188{
189 MD_CHECK_VOID(state != NULL, MD_ERR_NULL_POINTER, "state is NULL");
190
191 if (params != NULL) {
192 state->params = *params;
193 } else {
195 }
196
197 for (int i = 0; i < MD_VAD_NUM_FEATURES; i++) {
198 state->feat_min[i] = 1e30;
199 state->feat_max[i] = -1e30;
200 }
201
202 state->onset_counter = 0;
203 state->hangover_counter = 0;
204 state->current_decision = 0;
205 state->frames_processed = 0;
206}
207
208void MD_vad_calibrate(MD_vad_state *state, const double *signal,
209 unsigned N, double sample_rate)
210{
211 MD_CHECK_VOID(state != NULL, MD_ERR_NULL_POINTER, "state is NULL");
212 MD_CHECK_VOID(signal != NULL, MD_ERR_NULL_POINTER, "signal is NULL");
213 MD_CHECK_VOID(N >= 2, MD_ERR_INVALID_SIZE, "N must be >= 2");
214
215 double raw[MD_VAD_NUM_FEATURES];
216 extract_features(signal, N, sample_rate,
217 state->params.band_low_hz, state->params.band_high_hz,
218 raw);
219
220 update_normalization(state, raw);
221 state->frames_processed++;
222}
223
224int MD_vad_process_frame(MD_vad_state *state, const double *signal,
225 unsigned N, double sample_rate,
226 double *score_out, double *features_out)
227{
228 MD_CHECK(state != NULL, MD_ERR_NULL_POINTER, "state is NULL", 0);
229 MD_CHECK(signal != NULL, MD_ERR_NULL_POINTER, "signal is NULL", 0);
230 MD_CHECK(N >= 2, MD_ERR_INVALID_SIZE, "N must be >= 2", 0);
231
232 /* 1. Extract raw features */
233 double raw[MD_VAD_NUM_FEATURES];
234 extract_features(signal, N, sample_rate,
235 state->params.band_low_hz, state->params.band_high_hz,
236 raw);
237
238 /* 2. Update adaptive normalization */
239 update_normalization(state, raw);
240
241 /* 3. Normalize features to [0, 1] */
242 double norm[MD_VAD_NUM_FEATURES];
243 normalize_features(state, raw, norm);
244
245 /* 4. Compute weighted score */
246 double score = 0.0;
247 for (int i = 0; i < MD_VAD_NUM_FEATURES; i++)
248 score += state->params.weights[i] * norm[i];
249
250 /* 5. Apply state machine */
251 if (score >= state->params.threshold) {
252 state->onset_counter++;
253 if (state->current_decision == 0) {
254 if (state->onset_counter >= state->params.onset_frames) {
255 state->current_decision = 1;
257 }
258 } else {
260 }
261 } else {
262 if (state->current_decision == 1) {
263 if (state->hangover_counter > 0) {
264 state->hangover_counter--;
265 }
266 if (state->hangover_counter == 0) {
267 state->current_decision = 0;
268 state->onset_counter = 0;
269 }
270 } else {
271 state->onset_counter = 0;
272 }
273 }
274
275 /* 6. Write optional outputs */
276 if (score_out != NULL)
277 *score_out = score;
278 if (features_out != NULL) {
279 for (int i = 0; i < MD_VAD_NUM_FEATURES; i++)
280 features_out[i] = norm[i];
281 }
282
283 /* 7. Increment counter, return decision */
284 state->frames_processed++;
285 return state->current_decision;
286}
A mini library of DSP (Digital Signal Processing) routines.
#define MD_VAD_NUM_FEATURES
Total number of features.
Definition minidsp.h:1794
void MD_power_spectral_density(const double *signal, unsigned N, double *psd_out)
Compute the power spectral density (PSD) of a real-valued signal.
@ MD_ERR_INVALID_SIZE
A size or count argument is invalid.
Definition minidsp.h:63
@ MD_ERR_NULL_POINTER
A required pointer argument is NULL.
Definition minidsp.h:62
double MD_zero_crossing_rate(const double *a, unsigned N)
Compute the zero-crossing rate of a signal.
#define MD_VAD_FEAT_ENERGY
Frame energy.
Definition minidsp.h:1789
#define MD_VAD_FEAT_ZCR
Zero-crossing rate.
Definition minidsp.h:1790
#define MD_VAD_FEAT_SPECTRAL_ENTROPY
Spectral entropy.
Definition minidsp.h:1791
double MD_energy(const double *a, unsigned N)
Compute signal energy: sum of squared samples.
#define MD_VAD_FEAT_BAND_ENERGY_RATIO
Band energy ratio.
Definition minidsp.h:1793
#define MD_VAD_FEAT_SPECTRAL_FLATNESS
Spectral flatness.
Definition minidsp.h:1792
Internal header for cross-file dependencies within the minidsp module.
#define MD_CHECK(cond, code, msg, retval)
Check a precondition in a function that returns a value.
#define MD_CHECK_VOID(cond, code, msg)
Check a precondition in a void function.
static double compute_spectral_entropy(const double *psd, unsigned num_bins)
Spectral entropy: normalize PSD to a probability distribution, return -sum(p * log(p)) / log(num_bins).
Definition minidsp_vad.c:17
void MD_vad_default_params(MD_vad_params *params)
Populate a VAD params struct with optimized defaults.
static double compute_band_energy_ratio(const double *psd, unsigned num_bins, double sample_rate, unsigned N, double band_low_hz, double band_high_hz)
Band energy ratio: sum of PSD bins in [band_low_hz, band_high_hz] divided by total PSD sum.
Definition minidsp_vad.c:73
int MD_vad_process_frame(MD_vad_state *state, const double *signal, unsigned N, double sample_rate, double *score_out, double *features_out)
Process one audio frame and return a binary speech decision.
static double compute_spectral_flatness(const double *psd, unsigned num_bins)
Spectral flatness: geometric mean / arithmetic mean of PSD bins.
Definition minidsp_vad.c:44
void MD_vad_calibrate(MD_vad_state *state, const double *signal, unsigned N, double sample_rate)
Feed a known-silence frame to seed the adaptive normalization.
void MD_vad_init(MD_vad_state *state, const MD_vad_params *params)
Initialize VAD state from params.
Parameters for the VAD detector.
Definition minidsp.h:1803
double threshold
Decision threshold (0.0–1.0).
Definition minidsp.h:1805
double weights[MD_VAD_NUM_FEATURES]
Per-feature weights for scoring.
Definition minidsp.h:1804
double band_high_hz
Upper bound of speech band (Hz).
Definition minidsp.h:1810
unsigned onset_frames
Consecutive above-threshold frames before speech.
Definition minidsp.h:1806
double band_low_hz
Lower bound of speech band (Hz).
Definition minidsp.h:1809
double adaptation_rate
EMA rate for min/max tracking (0.0–1.0).
Definition minidsp.h:1808
unsigned hangover_frames
Extra speech frames after score drops.
Definition minidsp.h:1807
Internal state for the VAD detector.
Definition minidsp.h:1819
unsigned onset_counter
Consecutive above-threshold count.
Definition minidsp.h:1823
double feat_max[MD_VAD_NUM_FEATURES]
EMA-tracked feature maximums.
Definition minidsp.h:1822
MD_vad_params params
Copy of caller params.
Definition minidsp.h:1820
int current_decision
Current speech decision (0 or 1).
Definition minidsp.h:1825
double feat_min[MD_VAD_NUM_FEATURES]
EMA-tracked feature minimums.
Definition minidsp.h:1821
unsigned hangover_counter
Remaining hangover frames.
Definition minidsp.h:1824
unsigned frames_processed
Total frames seen.
Definition minidsp.h:1826