author Bjorn Bringert <bringert@android.com> 2011-04-15 07:28:40 -0700
committer Android (Google) Code Review <android-gerrit@google.com> 2011-04-15 07:28:40 -0700
commit 3ad604b3d8a3ae87ee3f7545677bacc8f11159c0 (patch)
tree 7d22b8cbbb0b2fae1433449cf132b939bc1b8b0b
parent 702acacfdadcb496130e6e1c8b427a47a1f35930 (diff)
parent 71e0b4807797c602e7fc787d00d27c4f9c92a507 (diff)
Merge "Improve TTS engine audio buffer API"
-rw-r--r--  core/java/android/speech/tts/FileSynthesisRequest.java      |  47
-rw-r--r--  core/java/android/speech/tts/PlaybackSynthesisRequest.java  | 111
-rw-r--r--  core/java/android/speech/tts/SynthesisRequest.java          |  35
3 files changed, 160 insertions, 33 deletions
diff --git a/core/java/android/speech/tts/FileSynthesisRequest.java b/core/java/android/speech/tts/FileSynthesisRequest.java
index 370ad5338c2a..6a9b2dc14de3 100644
--- a/core/java/android/speech/tts/FileSynthesisRequest.java
+++ b/core/java/android/speech/tts/FileSynthesisRequest.java
@@ -19,6 +19,7 @@ import android.media.AudioFormat;
import android.util.Log;
import java.io.File;
+import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
@@ -32,6 +33,8 @@ class FileSynthesisRequest extends SynthesisRequest {
private static final String TAG = "FileSynthesisRequest";
private static final boolean DBG = false;
+ private static final int MAX_AUDIO_BUFFER_SIZE = 8192;
+
private static final int WAV_HEADER_LENGTH = 44;
private static final short WAV_FORMAT_PCM = 0x0001;
@@ -81,6 +84,11 @@ class FileSynthesisRequest extends SynthesisRequest {
}
@Override
+ public int getMaxBufferSize() {
+ return MAX_AUDIO_BUFFER_SIZE;
+ }
+
+ @Override
public int start(int sampleRateInHz, int audioFormat, int channelCount) {
if (DBG) {
Log.d(TAG, "FileSynthesisRequest.start(" + sampleRateInHz + "," + audioFormat
@@ -152,8 +160,9 @@ class FileSynthesisRequest extends SynthesisRequest {
try {
// Write WAV header at start of file
mFile.seek(0);
- int fileLen = (int) mFile.length();
- mFile.write(makeWavHeader(mSampleRateInHz, mAudioFormat, mChannelCount, fileLen));
+ int dataLength = (int) (mFile.length() - WAV_HEADER_LENGTH);
+ mFile.write(
+ makeWavHeader(mSampleRateInHz, mAudioFormat, mChannelCount, dataLength));
closeFile();
return TextToSpeech.SUCCESS;
} catch (IOException ex) {
@@ -164,8 +173,37 @@ class FileSynthesisRequest extends SynthesisRequest {
}
}
+ @Override
+ public int completeAudioAvailable(int sampleRateInHz, int audioFormat, int channelCount,
+ byte[] buffer, int offset, int length) {
+ synchronized (mStateLock) {
+ if (mStopped) {
+ if (DBG) Log.d(TAG, "Request has been aborted.");
+ return TextToSpeech.ERROR;
+ }
+ }
+ FileOutputStream out = null;
+ try {
+ out = new FileOutputStream(mFileName);
+ out.write(makeWavHeader(sampleRateInHz, audioFormat, channelCount, length));
+ out.write(buffer, offset, length);
+ return TextToSpeech.SUCCESS;
+ } catch (IOException ex) {
+ Log.e(TAG, "Failed to write to " + mFileName + ": " + ex);
+ return TextToSpeech.ERROR;
+ } finally {
+ try {
+ if (out != null) {
+ out.close();
+ }
+ } catch (IOException ex) {
+ Log.e(TAG, "Failed to close " + mFileName + ": " + ex);
+ }
+ }
+ }
+
private byte[] makeWavHeader(int sampleRateInHz, int audioFormat, int channelCount,
- int fileLength) {
+ int dataLength) {
// TODO: is AudioFormat.ENCODING_DEFAULT always the same as ENCODING_PCM_16BIT?
int sampleSizeInBytes = (audioFormat == AudioFormat.ENCODING_PCM_8BIT ? 1 : 2);
int byteRate = sampleRateInHz * sampleSizeInBytes * channelCount;
@@ -177,7 +215,7 @@ class FileSynthesisRequest extends SynthesisRequest {
header.order(ByteOrder.LITTLE_ENDIAN);
header.put(new byte[]{ 'R', 'I', 'F', 'F' });
- header.putInt(fileLength - 8); // RIFF chunk size
+ header.putInt(dataLength + WAV_HEADER_LENGTH - 8); // RIFF chunk size
header.put(new byte[]{ 'W', 'A', 'V', 'E' });
header.put(new byte[]{ 'f', 'm', 't', ' ' });
header.putInt(16); // size of fmt chunk
@@ -188,7 +226,6 @@ class FileSynthesisRequest extends SynthesisRequest {
header.putShort(blockAlign);
header.putShort(bitsPerSample);
header.put(new byte[]{ 'd', 'a', 't', 'a' });
- int dataLength = fileLength - WAV_HEADER_LENGTH;
header.putInt(dataLength);
return headerBuf;
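
Note: the hunks above change makeWavHeader() to take the PCM payload length
(dataLength) rather than the total file length, so the RIFF chunk size becomes
dataLength + WAV_HEADER_LENGTH - 8 = dataLength + 36. A minimal standalone
sketch of the same 44-byte header layout; the WavHeaderSketch class is
hypothetical and takes the sample size in bytes directly instead of an
AudioFormat constant so it runs as plain Java:

    import java.nio.ByteBuffer;
    import java.nio.ByteOrder;

    class WavHeaderSketch {
        static final int WAV_HEADER_LENGTH = 44;
        static final short WAV_FORMAT_PCM = 0x0001;

        // dataLength is the PCM payload size in bytes, excluding the header.
        static byte[] makeWavHeader(int sampleRateInHz, int sampleSizeInBytes,
                int channelCount, int dataLength) {
            int byteRate = sampleRateInHz * sampleSizeInBytes * channelCount;
            short blockAlign = (short) (sampleSizeInBytes * channelCount);
            short bitsPerSample = (short) (sampleSizeInBytes * 8);

            byte[] headerBuf = new byte[WAV_HEADER_LENGTH];
            ByteBuffer header = ByteBuffer.wrap(headerBuf);
            header.order(ByteOrder.LITTLE_ENDIAN);
            header.put(new byte[]{ 'R', 'I', 'F', 'F' });
            // RIFF chunk size counts everything after these first 8 bytes.
            header.putInt(dataLength + WAV_HEADER_LENGTH - 8);
            header.put(new byte[]{ 'W', 'A', 'V', 'E' });
            header.put(new byte[]{ 'f', 'm', 't', ' ' });
            header.putInt(16); // fmt chunk payload size for plain PCM
            header.putShort(WAV_FORMAT_PCM);
            header.putShort((short) channelCount);
            header.putInt(sampleRateInHz);
            header.putInt(byteRate);
            header.putShort(blockAlign);
            header.putShort(bitsPerSample);
            header.put(new byte[]{ 'd', 'a', 't', 'a' });
            header.putInt(dataLength); // data chunk size: payload only
            return headerBuf;
        }
    }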
diff --git a/core/java/android/speech/tts/PlaybackSynthesisRequest.java b/core/java/android/speech/tts/PlaybackSynthesisRequest.java
index 15a4ee96ecc8..226701518805 100644
--- a/core/java/android/speech/tts/PlaybackSynthesisRequest.java
+++ b/core/java/android/speech/tts/PlaybackSynthesisRequest.java
@@ -78,6 +78,13 @@ class PlaybackSynthesisRequest extends SynthesisRequest {
}
}
+ @Override
+ public int getMaxBufferSize() {
+ // The AudioTrack buffer will be at least MIN_AUDIO_BUFFER_SIZE, so that should always be
+ // a safe buffer size to pass in.
+ return MIN_AUDIO_BUFFER_SIZE;
+ }
+
// TODO: add a thread that writes to the AudioTrack?
@Override
public int start(int sampleRateInHz, int audioFormat, int channelCount) {
@@ -86,20 +93,6 @@ class PlaybackSynthesisRequest extends SynthesisRequest {
+ "," + channelCount + ")");
}
- int channelConfig;
- if (channelCount == 1) {
- channelConfig = AudioFormat.CHANNEL_OUT_MONO;
- } else if (channelCount == 2){
- channelConfig = AudioFormat.CHANNEL_OUT_STEREO;
- } else {
- Log.e(TAG, "Unsupported number of channels: " + channelCount);
- return TextToSpeech.ERROR;
- }
-
- int minBufferSizeInBytes
- = AudioTrack.getMinBufferSize(sampleRateInHz, channelConfig, audioFormat);
- int bufferSizeInBytes = Math.max(MIN_AUDIO_BUFFER_SIZE, minBufferSizeInBytes);
-
synchronized (mStateLock) {
if (mStopped) {
if (DBG) Log.d(TAG, "Request has been aborted.");
@@ -111,22 +104,19 @@ class PlaybackSynthesisRequest extends SynthesisRequest {
return TextToSpeech.ERROR;
}
- mAudioTrack = new AudioTrack(mStreamType, sampleRateInHz, channelConfig, audioFormat,
- bufferSizeInBytes, AudioTrack.MODE_STREAM);
- if (mAudioTrack.getState() != AudioTrack.STATE_INITIALIZED) {
- cleanUp();
+ mAudioTrack = createAudioTrack(sampleRateInHz, audioFormat, channelCount,
+ AudioTrack.MODE_STREAM);
+ if (mAudioTrack == null) {
return TextToSpeech.ERROR;
}
-
- setupVolume();
}
return TextToSpeech.SUCCESS;
}
- private void setupVolume() {
- float vol = clip(mVolume, 0.0f, 1.0f);
- float panning = clip(mPan, -1.0f, 1.0f);
+ private void setupVolume(AudioTrack audioTrack, float volume, float pan) {
+ float vol = clip(volume, 0.0f, 1.0f);
+ float panning = clip(pan, -1.0f, 1.0f);
float volLeft = vol;
float volRight = vol;
if (panning > 0.0f) {
@@ -135,7 +125,7 @@ class PlaybackSynthesisRequest extends SynthesisRequest {
volRight *= (1.0f + panning);
}
if (DBG) Log.d(TAG, "volLeft=" + volLeft + ",volRight=" + volRight);
- if (mAudioTrack.setStereoVolume(volLeft, volRight) != AudioTrack.SUCCESS) {
+ if (audioTrack.setStereoVolume(volLeft, volRight) != AudioTrack.SUCCESS) {
Log.e(TAG, "Failed to set volume");
}
}
@@ -148,7 +138,10 @@ class PlaybackSynthesisRequest extends SynthesisRequest {
public int audioAvailable(byte[] buffer, int offset, int length) {
if (DBG) {
Log.d(TAG, "audioAvailable(byte[" + buffer.length + "],"
- + offset + "," + length + "), thread ID=" + android.os.Process.myTid());
+ + offset + "," + length + ")");
+ }
+ if (length > getMaxBufferSize()) {
+ throw new IllegalArgumentException("buffer is too large (" + length + " bytes)");
}
synchronized (mStateLock) {
if (mStopped) {
@@ -194,4 +187,72 @@ class PlaybackSynthesisRequest extends SynthesisRequest {
}
return TextToSpeech.SUCCESS;
}
+
+ @Override
+ public int completeAudioAvailable(int sampleRateInHz, int audioFormat, int channelCount,
+ byte[] buffer, int offset, int length) {
+ if (DBG) {
+ Log.d(TAG, "completeAudioAvailable(" + sampleRateInHz + "," + audioFormat
+ + "," + channelCount + "byte[" + buffer.length + "],"
+ + offset + "," + length + ")");
+ }
+
+ synchronized (mStateLock) {
+ if (mStopped) {
+ if (DBG) Log.d(TAG, "Request has been aborted.");
+ return TextToSpeech.ERROR;
+ }
+ if (mAudioTrack != null) {
+ Log.e(TAG, "start() called before completeAudioAvailable()");
+ cleanUp();
+ return TextToSpeech.ERROR;
+ }
+
+ mAudioTrack = createAudioTrack(sampleRateInHz, audioFormat, channelCount,
+ AudioTrack.MODE_STATIC);
+ if (mAudioTrack == null) {
+ return TextToSpeech.ERROR;
+ }
+
+ try {
+ mAudioTrack.write(buffer, offset, length);
+ mAudioTrack.play();
+ } catch (IllegalStateException ex) {
+ Log.e(TAG, "Playback error", ex);
+ return TextToSpeech.ERROR;
+ } finally {
+ cleanUp();
+ }
+ }
+
+ return TextToSpeech.SUCCESS;
+ }
+
+ private AudioTrack createAudioTrack(int sampleRateInHz, int audioFormat, int channelCount,
+ int mode) {
+ int channelConfig;
+ if (channelCount == 1) {
+ channelConfig = AudioFormat.CHANNEL_OUT_MONO;
+ } else if (channelCount == 2){
+ channelConfig = AudioFormat.CHANNEL_OUT_STEREO;
+ } else {
+ Log.e(TAG, "Unsupported number of channels: " + channelCount);
+ return null;
+ }
+
+ int minBufferSizeInBytes
+ = AudioTrack.getMinBufferSize(sampleRateInHz, channelConfig, audioFormat);
+ int bufferSizeInBytes = Math.max(MIN_AUDIO_BUFFER_SIZE, minBufferSizeInBytes);
+ AudioTrack audioTrack = new AudioTrack(mStreamType, sampleRateInHz, channelConfig,
+ audioFormat, bufferSizeInBytes, mode);
+ if (audioTrack == null) {
+ return null;
+ }
+ if (audioTrack.getState() != AudioTrack.STATE_INITIALIZED) {
+ audioTrack.release();
+ return null;
+ }
+ setupVolume(audioTrack, mVolume, mPan);
+ return audioTrack;
+ }
}
\ No newline at end of file
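
Note: the new createAudioTrack() above is shared by the streaming path
(MODE_STREAM, used by start()) and the one-shot path (MODE_STATIC, used by
completeAudioAvailable()). A hedged sketch of the static-mode flow on its own;
StaticPlaybackSketch and the mono/16-bit defaults are assumptions for
illustration, not part of the patch:

    import android.media.AudioFormat;
    import android.media.AudioManager;
    import android.media.AudioTrack;

    class StaticPlaybackSketch {
        // Plays a complete PCM clip in one shot; returns null on failure.
        static AudioTrack playAll(int sampleRateInHz, byte[] pcm) {
            // MODE_STATIC requires the track buffer to hold the whole clip
            // up front, unlike MODE_STREAM, which sizes the buffer from
            // AudioTrack.getMinBufferSize().
            AudioTrack track = new AudioTrack(AudioManager.STREAM_MUSIC,
                    sampleRateInHz, AudioFormat.CHANNEL_OUT_MONO,
                    AudioFormat.ENCODING_PCM_16BIT, pcm.length,
                    AudioTrack.MODE_STATIC);
            if (track.getState() == AudioTrack.STATE_UNINITIALIZED) {
                track.release(); // construction failed
                return null;
            }
            track.write(pcm, 0, pcm.length); // load the entire clip
            track.play();                    // caller releases when done
            return track;
        }
    }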
diff --git a/core/java/android/speech/tts/SynthesisRequest.java b/core/java/android/speech/tts/SynthesisRequest.java
index 3f2ec5d746b7..f4bb85228901 100644
--- a/core/java/android/speech/tts/SynthesisRequest.java
+++ b/core/java/android/speech/tts/SynthesisRequest.java
@@ -18,6 +18,13 @@ package android.speech.tts;
/**
* A request for speech synthesis given to a TTS engine for processing.
*
+ * The engine can provide streaming audio by calling
+ * {@link #start}, then {@link #audioAvailable} until all audio has been provided, then finally
+ * {@link #done}.
+ *
+ * Alternatively, the engine can provide all the audio at once, by using
+ * {@link #completeAudioAvailable}.
+ *
* @hide Pending approval
*/
public abstract class SynthesisRequest {
@@ -101,6 +108,12 @@ public abstract class SynthesisRequest {
}
/**
+ * Gets the maximum number of bytes that the TTS engine can pass in a single call of
+ * {@link #audioAvailable}. This does not apply to {@link #completeAudioAvailable}.
+ */
+ public abstract int getMaxBufferSize();
+
+ /**
* Aborts the speech request.
*
* Can be called from multiple threads.
@@ -117,7 +130,7 @@ public abstract class SynthesisRequest {
* @param sampleRateInHz Sample rate in HZ of the generated audio.
* @param audioFormat Audio format of the generated audio. Must be one of
* the ENCODING_ constants defined in {@link android.media.AudioFormat}.
- * @param channelCount The number of channels
+ * @param channelCount The number of channels. Must be {@code 1} or {@code 2}.
* @return {@link TextToSpeech#SUCCESS} or {@link TextToSpeech#ERROR}.
*/
public abstract int start(int sampleRateInHz, int audioFormat, int channelCount);
@@ -131,8 +144,8 @@ public abstract class SynthesisRequest {
* @param buffer The generated audio data. This method will not hold on to {@code buffer},
* so the caller is free to modify it after this method returns.
* @param offset The offset into {@code buffer} where the audio data starts.
- * @param length The number of bytes of audio data in {@code buffer}.
- * Must be less than or equal to {@code buffer.length - offset}.
+ * @param length The number of bytes of audio data in {@code buffer}. This must be
+ * less than or equal to the return value of {@link #getMaxBufferSize}.
* @return {@link TextToSpeech#SUCCESS} or {@link TextToSpeech#ERROR}.
*/
public abstract int audioAvailable(byte[] buffer, int offset, int length);
@@ -148,4 +161,20 @@ public abstract class SynthesisRequest {
*/
public abstract int done();
+ /**
+ * The service can call this method instead of using {@link #start}, {@link #audioAvailable}
+ * and {@link #done} if all the audio data is available in a single buffer.
+ *
+ * @param sampleRateInHz Sample rate in HZ of the generated audio.
+ * @param audioFormat Audio format of the generated audio. Must be one of
+ * the ENCODING_ constants defined in {@link android.media.AudioFormat}.
+ * @param channelCount The number of channels. Must be {@code 1} or {@code 2}.
+ * @param buffer The generated audio data. This method will not hold on to {@code buffer},
+ * so the caller is free to modify it after this method returns.
+ * @param offset The offset into {@code buffer} where the audio data starts.
+ * @param length The number of bytes of audio data in {@code buffer}.
+ * @return {@link TextToSpeech#SUCCESS} or {@link TextToSpeech#ERROR}.
+ */
+ public abstract int completeAudioAvailable(int sampleRateInHz, int audioFormat,
+ int channelCount, byte[] buffer, int offset, int length);
}
\ No newline at end of file
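
Note: taken together, the new SynthesisRequest surface gives an engine two
ways to deliver audio. A hedged sketch of a caller driving the streaming path;
EngineSketch and streamPcm() are hypothetical, and only the SynthesisRequest
methods added in this change are assumed real:

    import android.media.AudioFormat;
    import android.speech.tts.SynthesisRequest;
    import android.speech.tts.TextToSpeech;

    class EngineSketch {
        int streamPcm(SynthesisRequest request, byte[] pcm, int sampleRateInHz) {
            if (request.start(sampleRateInHz, AudioFormat.ENCODING_PCM_16BIT, 1)
                    != TextToSpeech.SUCCESS) {
                return TextToSpeech.ERROR;
            }
            // audioAvailable() now rejects buffers larger than
            // getMaxBufferSize(), so feed the audio in bounded chunks.
            int maxChunk = request.getMaxBufferSize();
            for (int offset = 0; offset < pcm.length; offset += maxChunk) {
                int length = Math.min(maxChunk, pcm.length - offset);
                if (request.audioAvailable(pcm, offset, length)
                        != TextToSpeech.SUCCESS) {
                    return TextToSpeech.ERROR;
                }
            }
            return request.done();
        }
    }

The one-shot alternative collapses the same work into a single call:
request.completeAudioAvailable(sampleRateInHz, AudioFormat.ENCODING_PCM_16BIT,
1, pcm, 0, pcm.length).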