Program to record speech audio into RAM and then play it back, moving Billy Bass's mouth in sync with the speech.

Dependencies:   mbed

Remember Big Mouth Billy Bass?

I've made a simple demo program for him using the Freescale FRDM-KL25Z board. I've hooked up the digital I/O to his motor driver transistors and pushbutton switch.

This program records 1.8 seconds of speech audio from ADC input when the pushbutton is pressed, then plays the audio back with Billy Bass's mouth controlled so that it opens during vowel sounds.

The ADC input is driven by a microphone and preamplifier, coupled through a capacitor into a resistor divider that is connected to the +3.3V supply pin; the divider biases the signal at the middle of the ADC's input range.

The DAC output is connected to his audio amplifier input (to the trace that was connected to pin 10 of the controller IC). I had to provide a DC bias using the DAC to get the single transistor amplifier biased into proper operation.

For more on the method of vowel recognition, please see the paper: http://www.mirlab.org/conference_papers/International_Conference/ICASSP%201999/PDF/AUTHOR/IC991957.PDF

Y. Nishida, Y. Nakadai, Y. Suzuki, T. Sakurai, T. Kurokawa, and H. Sato. 1999.

Voice recognition focusing on vowel strings on a fixed-point 20-MIPS DSP board.

In Proceedings of the 1999 IEEE International Conference on Acoustics, Speech, and Signal Processing - Volume 01 (ICASSP '99), Vol. 1. IEEE Computer Society, Washington, DC, USA, 137-140. DOI=10.1109/ICASSP.1999.758081 http://dx.doi.org/10.1109/ICASSP.1999.758081

Files at this revision

API Documentation at this revision

Committer:
bikeNomad
Date:
Wed May 15 17:53:33 2013 +0000
Parent:
3:c04d8d0493f4
Child:
5:9f4ffb2b0e6b
Commit message:
tuned analyzer; added play with billy

Changed in this revision

AudioAnalyzer.h Show annotated file Show diff for this revision Revisions of this file
main.cpp Show annotated file Show diff for this revision Revisions of this file
--- a/AudioAnalyzer.h	Wed May 15 15:32:34 2013 +0000
+++ b/AudioAnalyzer.h	Wed May 15 17:53:33 2013 +0000
@@ -1,6 +1,8 @@
 #ifndef __included_audio_analyzer_h
 #define __included_audio_analyzer_h
 
+#include <math.h>
+
 namespace NK
 {
 
@@ -11,6 +13,8 @@
     uint16_t nsamples;
     uint16_t zeroCrossings;
     uint32_t power;
+    float logPower;
+    float powerRef;
     int8_t minValue;
     int8_t maxValue;
     bool analyzed;
@@ -19,7 +23,7 @@
 
 public:
     AudioAnalyzer(int8_t const *_samples, uint16_t _nsamples)
-        : samples(_samples), nsamples(_nsamples), zeroCrossings(0), power(0), analyzed(false) {
+        : samples(_samples), nsamples(_nsamples), zeroCrossings(0), power(0), logPower(0.0), powerRef(0.0), analyzed(false) {
     }
 
     uint16_t getZeroCrossings() {
@@ -27,16 +31,52 @@
         return zeroCrossings;
     }
 
+    float getZeroCrossingRatioPercent() {
+        return getZeroCrossings() * 100.0 / nsamples;
+    }
+
     uint32_t getPower() {
         if (!analyzed) analyze();
         return power;
     }
 
+    float getLogPower() {
+        if (!analyzed) analyze();
+        logPower = ::log((double)power) - powerRef;
+        return logPower;
+    }
+
     void getMinMaxValues(int8_t *min, int8_t *max) {
         if (!analyzed) analyze();
         *min = minValue;
         *max = maxValue;
     }
+
+    bool isVoiced() {
+        return !(isnan(getLogPower()) || logPower < PowerThreshold);
+    }
+
+    void setPowerRef(float _powerRef) {
+        powerRef = _powerRef;
+    }
+
+    // A chunk is a vowel when its logPower is at least PowerThreshold
+    // and its zero-crossing percentage lies below the line
+    // zeroCrossingRatioPercent = VowelSlope * (logPower - VowelXIntercept)
+    bool isVowel() {
+        getLogPower();
+        if (logPower < PowerThreshold)
+            return false;
+        return (getZeroCrossingRatioPercent() < VowelSlope * (logPower - VowelXIntercept));
+    }
+
+    static const float PowerThreshold = -4.0;
+    // anything below the line
+    // zeroCrossingRatioPercent = VowelSlope * (logPower - VowelXIntercept)
+    // and at or above PowerThreshold
+    // is considered a vowel.
+    static const float VowelSlope = 14.7;
+    static const float VowelXIntercept = -0.7;
 };
 
 } // namespace NK
--- a/main.cpp	Wed May 15 15:32:34 2013 +0000
+++ b/main.cpp	Wed May 15 17:53:33 2013 +0000
@@ -32,17 +32,23 @@
 Serial pc(USBTX, USBRX);
 
 const unsigned SAMPLE_RATE_HZ  = 7889;
-const unsigned SAMPLE_PERIOD_US     = (1000000U / SAMPLE_RATE_HZ);
+const unsigned SAMPLE_PERIOD_US  = (1000000U / SAMPLE_RATE_HZ);
 const unsigned SAMPLE_BUFFER_SIZE = 9000;
-const float CHUNK_DURATION = 0.08;
+const unsigned CHUNK_DURATION_MS = 80;
+const unsigned CHUNK_SIZE = SAMPLE_RATE_HZ  * CHUNK_DURATION_MS / 1000;
+const unsigned NUM_CHUNKS = SAMPLE_BUFFER_SIZE / CHUNK_SIZE;
 
 Ticker sampleTicker;
 Timer timer;
 
+// audio samples
 int8_t sampleBuffer[SAMPLE_BUFFER_SIZE];      // 1 second buffer
 int8_t * volatile nextSample;
 uint16_t volatile samplesRemaining;
 
+// vowel decisions
+bool vowels[ NUM_CHUNKS ];
+
 extern "C"
 void ADC0_IRQHandler(void)
 {
@@ -97,16 +103,14 @@
     pc.printf("Done. %u samples in %f usec = %f samples/sec\r\n", SAMPLE_BUFFER_SIZE, elapsed * 1.0e6, SAMPLE_BUFFER_SIZE / elapsed);
 }
 
-void playAudio(float duration, int8_t *start=sampleBuffer, uint16_t nsamples=SAMPLE_BUFFER_SIZE)
+void playAudio(unsigned duration_ms, int8_t *start=sampleBuffer, uint16_t nsamples=SAMPLE_BUFFER_SIZE)
 {
-    greenLED = 0.0;
     resetSampleBuffer(start, nsamples);
     timer.reset();
     timer.start();
-    sampleTicker.attach(&playAudioSample, duration/nsamples);
+    sampleTicker.attach_us(&playAudioSample, duration_ms*1000/nsamples);
     while (samplesRemaining) {
-        wait_us(50000);
-        greenLED.write(1.0 - (1.0 *  samplesRemaining / nsamples));
+        wait_us(CHUNK_DURATION_MS * 1000);
     }
 }
 
@@ -122,47 +126,57 @@
 }
 
 // returns true if chunk was louder than minimum
-bool analyzeChunk(int8_t  *chunkStart, uint16_t chunkSize, float powerRef)
+bool analyzeChunk(int8_t  *chunkStart, uint16_t CHUNK_SIZE, float powerRef, bool *pisvowel = 0)
 {
-    AudioAnalyzer analyzer(chunkStart, chunkSize);
+    AudioAnalyzer analyzer(chunkStart, CHUNK_SIZE);
     uint32_t power = analyzer.getPower();
     uint16_t zcs = analyzer.getZeroCrossings();
     int8_t min, max;
     analyzer.getMinMaxValues(&min, &max);
-    float logPower = ::log((double)power);
-    if (isnan(logPower) || logPower < 1.0) {
-        return false;
-    }
-    float zcRatio = (float)zcs / chunkSize;
-    pc.printf("%.2f\t%.2f\t%d\t%d\t", zcRatio*100, logPower-powerRef, min, max);
-    return true;
+    analyzer.setPowerRef(powerRef);
+    float logPower = analyzer.getLogPower();
+    float zcRatio = analyzer.getZeroCrossingRatioPercent();
+    pc.printf("%.2f\t%.2f\t%.2f\t%d\t%d\t%d\t", zcRatio, logPower, zcRatio / (logPower - AudioAnalyzer::VowelXIntercept), min, max, analyzer.isVowel());
+    if (pisvowel)
+        *pisvowel = analyzer.isVowel();
+    return analyzer.isVoiced();
 }
 
 void analyze(bool playToo = false)
 {
-    uint16_t chunkSize = SAMPLE_RATE_HZ  * CHUNK_DURATION;
-    uint16_t nChunks = SAMPLE_BUFFER_SIZE / chunkSize;
     int8_t  *chunkStart = sampleBuffer;
     AudioAnalyzer analyzer(sampleBuffer, SAMPLE_BUFFER_SIZE);
     uint32_t power = analyzer.getPower();
     float powerRef = ::log((double)power);
     pc.printf("Reference power = %.2f\r\n", powerRef);
-    pc.printf("Analyzing %d chunks of %d samples (%.2f seconds):\r\n", nChunks, chunkSize, CHUNK_DURATION);
-    pc.printf("chunk\tstart\tzcratio\tlogp\tmin\tmax\tvowel\r\n");
-    for (uint16_t chunk = 0; chunk < nChunks; chunk++) {
-        pc.printf("%u\t%.2f\t", chunk, chunk * CHUNK_DURATION);
-        bool loudEnough = analyzeChunk(chunkStart, chunkSize, powerRef);
+    pc.printf("Analyzing %d chunks of %d samples (%.2f seconds):\r\n", NUM_CHUNKS, CHUNK_SIZE, CHUNK_DURATION_MS);
+    pc.printf("chunk\tstartms\tzcratio\tlogp\tmaxs\tmin\tmax\tisVowel\tvowel\r\n");
+    for (uint16_t chunk = 0; chunk < NUM_CHUNKS; chunk++) {
+        pc.printf("%u\t%u\t", chunk, chunk * CHUNK_DURATION_MS);
+        bool loudEnough = analyzeChunk(chunkStart, CHUNK_SIZE, powerRef, &vowels[chunk]);
         if (loudEnough) {
             if (playToo) {
                 while (! pc.readable())
-                    playAudio(CHUNK_DURATION, chunkStart, chunkSize);
+                    playAudio(CHUNK_DURATION_MS, chunkStart, CHUNK_SIZE);
                 int c = pc.getc();
                 pc.putc(c);
             } else
                 pc.puts("-");
         }
         pc.puts("\r\n");
-        chunkStart += chunkSize;
+        chunkStart += CHUNK_SIZE;
+    }
+}
+
+// assumes that vowels[] has been set by analyze
+void playWithBilly()
+{
+    int8_t  *chunkStart = sampleBuffer;
+    for (uint16_t chunk = 0; chunk < NUM_CHUNKS; chunk++) {
+        greenLED = vowels[chunk] ? 0.0 : 1.0;
+        playAudio(CHUNK_DURATION_MS, chunkStart, CHUNK_SIZE);
+        chunkStart += CHUNK_SIZE;
+
     }
 }
 
@@ -189,17 +203,19 @@
 
 #if 0
         audioTest();
-        playAudio(1.0);
+        playAudio(1000);
         analyze();
 #endif
 
         recordAudio();
         float duration = timer.read();
-        playAudio(duration);
+        // playAudio(duration * 1000);
         float elapsed = timer.read();
         pc.printf("Done. %u samples in %f usec = %f samples/sec", SAMPLE_BUFFER_SIZE, elapsed * 1.0e6, SAMPLE_BUFFER_SIZE / elapsed);
         pc.printf(" (Rate %#+0.2f%%)\r\n", (duration-elapsed)*100/duration);
-        analyze(true);
+        analyze(false);
         // dumpAudio();
+
+        playWithBilly();
     }
 }