1. Sam Foster
  2. iPhone TTS
  3. Issues
Issue #5 new

Here's the fix to make it work without writing out the wave data

Anonymous created an issue

From zigzag@nullriver.com:

/**
 * Synthesizes `text` with Flite and plays the result directly from memory,
 * without writing the wave data to a file first.
 *
 * The synthesized samples are wrapped in a 44-byte RIFF/WAVE header (16-bit
 * PCM) and handed to AVAudioPlayer via -initWithData:error:.
 *
 * @param text The text to speak.
 */
- (void)speakText:(NSString *)text
{
    // Rebuild the text one UTF-16 unit at a time through "%c".
    // NOTE(review): "%c" formats an 8-bit character, so units above 0xFF are
    // not preserved; presumably this is meant to reduce the input to what
    // Flite accepts -- confirm.
    // NOTE(review): the "> 1" guard means one-character input is spoken as an
    // empty string; this looks like an off-by-one but is kept as-is -- confirm.
    NSMutableString *cleanString = [NSMutableString stringWithString:@""];
    if ([text length] > 1) {
        for (NSUInteger x = 0; x < [text length]; x++) {
            unichar ch = [text characterAtIndex:x];
            [cleanString appendFormat:@"%c", ch];
        }
    }

    sound = flite_text_to_wave([cleanString UTF8String], voice);
    if (sound == NULL) {
        // Without this check the sample buffer would be read through NULL.
        NSLog(@"Flite Error: could not synthesize text");
        return;
    }

    // Copy the wave into soundData, preceded by a RIFF/WAVE header.
    // Fields are appended in host byte order.
    // NOTE(review): WAVE is little-endian; assumes a little-endian host -- confirm.
    NSMutableData *soundData = [NSMutableData data];
    int32_t d_int;
    int16_t d_short;
    const int32_t dataBytes =
        (int32_t)(cst_wave_num_channels(sound) * cst_wave_num_samples(sound) * sizeof(short));

    // Chunk tags are always exactly four bytes. The previous code used
    // sizeof() on a char pointer, which is the pointer size, not the tag size.
    [soundData appendBytes:"RIFF" length:4];
    d_int = dataBytes + 8 + 16 + 12; /* bytes following this field */
    [soundData appendBytes:&d_int length:sizeof(d_int)];
    [soundData appendBytes:"WAVE" length:4];

    [soundData appendBytes:"fmt " length:4];
    d_int = 16; /* size of the fmt chunk body */
    [soundData appendBytes:&d_int length:sizeof(d_int)];
    d_short = RIFF_FORMAT_PCM;
    [soundData appendBytes:&d_short length:sizeof(d_short)];
    d_short = cst_wave_num_channels(sound); /* number of channels */
    [soundData appendBytes:&d_short length:sizeof(d_short)];
    d_int = cst_wave_sample_rate(sound); /* sample rate */
    [soundData appendBytes:&d_int length:sizeof(d_int)];
    d_int = (cst_wave_sample_rate(sound) * cst_wave_num_channels(sound) * sizeof(short)); /* average bytes per second */
    [soundData appendBytes:&d_int length:sizeof(d_int)];
    d_short = (cst_wave_num_channels(sound) * sizeof(short)); /* block align */
    [soundData appendBytes:&d_short length:sizeof(d_short)];
    d_short = 2 * 8; /* bits per sample */
    [soundData appendBytes:&d_short length:sizeof(d_short)];

    [soundData appendBytes:"data" length:4];
    d_int = dataBytes; /* bytes in data */
    [soundData appendBytes:&d_int length:sizeof(d_int)];
    // Append the samples directly; no intermediate NSData copy is needed.
    [soundData appendBytes:sound->samples length:dataBytes];

    delete_wave(sound);
    sound = NULL; // do not keep a pointer to the freed wave

    // Judge success by the initializer's return value. The error pointer is
    // only meaningful on failure and was previously read uninitialised.
    NSError *error = nil;
    AVAudioPlayer *player = [[[AVAudioPlayer alloc] initWithData:soundData error:&error] autorelease];
    self.audioPlayer = player;

    if (player != nil)
    {
        [player setDelegate:self];
        [player prepareToPlay];
        [player play];
    }
    else
    {
        NSLog(@"AVAudioPlayer Error: %@", error.localizedDescription);
    }
}

Comments (2)

  1. Anonymous

    also added threading and a queue:

    .h

    //
    //  FliteTTS.h
    //  iPhone Text To Speech based on Flite
    //
    //  Copyright (c) 2010 Sam Foster
    //
    //  Permission is hereby granted, free of charge, to any person obtaining a copy
    //  of this software and associated documentation files (the "Software"), to deal
    //  in the Software without restriction, including without limitation the rights
    //  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    //  copies of the Software, and to permit persons to whom the Software is
    //  furnished to do so, subject to the following conditions:
    //
    //  The above copyright notice and this permission notice shall be included in
    //  all copies or substantial portions of the Software.
    // 
    //  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    //  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    //  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    //  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    //  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    //  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    //  THE SOFTWARE.
    //
    //  Author: Sam Foster <samfoster@gmail.com> <http://cmang.org>
    //  Copyright 2010. All rights reserved.
    //
    
    #import <Foundation/Foundation.h>
    #import <AVFoundation/AVFoundation.h>
    #import "flite.h"
    
    // Text-to-speech front end for Flite: queues strings, synthesizes each one
    // with the selected Flite voice and plays the result through AVAudioPlayer.
    @interface FliteTTS : NSObject <AVAudioPlayerDelegate> {
    	AVAudioPlayer* audioPlayer;       // player for the utterance being played
        cst_voice * voice;                // Flite voice selected by -setVoice:
        BOOL isSpeaking;                  // YES from dequeue until playback ends or fails
        NSMutableArray * speechQueue;     // pending strings, spoken first-in first-out
    }
    
    // Player for the current utterance; nil when nothing is playing.
    @property (retain) AVAudioPlayer * audioPlayer;
    // YES while an utterance is being synthesized or played.
    @property BOOL isSpeaking;
    // Strings waiting to be spoken, oldest first.
    @property (retain) NSMutableArray * speechQueue;
    
    // Use these:
    // Appends text to the queue and starts speaking if nothing is in progress.
    - (void)speakText:(NSString *)text;
    // Starts the next queued string on a new thread unless already speaking
    // or the queue is empty.
    - (void)speakIfNeeded;
    // Empties the queue, stops the current player and clears isSpeaking.
    - (void)stopSpeaking;
    // Sets the voice's "int_f0_target_mean", "int_f0_target_stddev" and
    // "duration_stretch" features from pitch, variance and speed.
    - (void)setPitch:(float)pitch variance:(float)variance speed:(float)speed;
    // Selects a voice by name: cmu_us_kal, cmu_us_kal16, cmu_us_rms,
    // cmu_us_awb or cmu_us_slt. Any other name leaves the voice unchanged.
    - (void)setVoice:(NSString *)voicename;
    // Returns the wave's samples wrapped in a RIFF/WAVE (16-bit PCM) header.
    - (NSData *)riffDataForCSTWave:(cst_wave *)wave;
    
    @end
    
    
    .m
    
    //
    //  FliteTTS.m
    //  iPhone Text To Speech based on Flite
    //
    //  Copyright (c) 2010 Sam Foster
    //
    //  Permission is hereby granted, free of charge, to any person obtaining a copy
    //  of this software and associated documentation files (the "Software"), to deal
    //  in the Software without restriction, including without limitation the rights
    //  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    //  copies of the Software, and to permit persons to whom the Software is
    //  furnished to do so, subject to the following conditions:
    //
    //  The above copyright notice and this permission notice shall be included in
    //  all copies or substantial portions of the Software.
    // 
    //  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    //  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    //  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    //  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    //  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    //  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    //  THE SOFTWARE.
    //
    //  Author: Sam Foster <samfoster@gmail.com> <http://cmang.org>
    //  Copyright 2010. All rights reserved.
    //
    
    #import "FliteTTS.h"
    
    
    // Voice registration entry points, called from -setVoice: to obtain the
    // cst_voice used for synthesis. They are declared here rather than taken
    // from a header.
    // NOTE(review): declared with empty parameter lists; confirm the real
    // signatures against the Flite voice sources before adding arguments.
    cst_voice *register_cmu_us_kal();
    cst_voice *register_cmu_us_kal16();
    cst_voice *register_cmu_us_rms();
    cst_voice *register_cmu_us_awb();
    cst_voice *register_cmu_us_slt();
    
    @implementation FliteTTS
    
    @synthesize audioPlayer;
    @synthesize isSpeaking;
    @synthesize speechQueue;
    
    // Creates an idle speaker with an empty queue, initializes Flite and
    // selects the "cmu_us_kal" voice.
    -(id)init 
    {
        self = [super init];
        if(self == nil)
        {
            return nil;
        }

        // Start idle with nothing queued.
        [self setIsSpeaking:NO];
        [self setSpeechQueue:[NSMutableArray array]];

        // Bring up the synthesizer before choosing the default voice.
        flite_init();
        [self setVoice:@"cmu_us_kal"];

        return self;
    }
    
    // Stops any playback in progress and releases the player and the queue.
    - (void)dealloc 
    {
        // -_speakText: installs self as the player's delegate. The player can
        // outlive this object (it may still be retained elsewhere), so detach
        // the delegate first to avoid a callback into freed memory.
        // The ivar is used directly because the atomic getter would hand the
        // player to an autorelease pool during teardown.
        [audioPlayer setDelegate:nil];
        [audioPlayer stop]; // messaging nil is a no-op, so no explicit check
        self.audioPlayer = nil;
        self.speechQueue = nil;
        
        [super dealloc];
    }
    
    // Queues text for speaking and starts playback if nothing is in progress.
    // A nil text is ignored.
    -(void)speakText:(NSString *)text 
    {
        // -addObject: raises NSInvalidArgumentException for nil, so bail out early.
        if(text == nil) return;

        // Queue a copy so a caller-owned mutable string cannot change while
        // it waits for its turn.
        [self.speechQueue addObject:[[text copy] autorelease]];
        [self speakIfNeeded];
    }
    
    // Starts synthesizing the oldest queued string on a new thread, unless an
    // utterance is already in progress or nothing is queued.
    - (void)speakIfNeeded {
        if(self.isSpeaking) return;
        if([self.speechQueue count] == 0) return;

        [self setIsSpeaking:YES];

        // The detached thread retains its argument while it runs, so removing
        // the string from the queue right afterwards is safe.
        NSString * nextUtterance = [self.speechQueue objectAtIndex:0];
        [NSThread detachNewThreadSelector:@selector(_speakText:)
                                 toTarget:self
                               withObject:nextUtterance];

        [self.speechQueue removeObjectAtIndex:0];
    }
    
    // Thread entry point (detached by -speakIfNeeded): synthesizes text with
    // Flite, wraps the samples as RIFF data and starts playing them.
    // On failure isSpeaking is reset to NO; on success it stays YES until the
    // delegate callback fires or -stopSpeaking is called.
    -(void)_speakText:(NSString *)text 
    {
        NSAutoreleasePool * pool = [[NSAutoreleasePool alloc] init];
        
        // Rebuild the text one UTF-16 unit at a time through "%c".
        // NOTE(review): "%c" formats an 8-bit character, so units above 0xFF
        // are not preserved; presumably meant to reduce the input to what
        // Flite accepts -- confirm.
        // NOTE(review): the "> 1" guard means one-character input is spoken
        // as an empty string; looks like an off-by-one, kept as-is -- confirm.
        NSMutableString * cleanString = [NSMutableString stringWithString:@""];
        if([text length] > 1)
        {
            for(NSUInteger x = 0; x < [text length]; x++)
            {
                unichar ch = [text characterAtIndex:x];
                [cleanString appendFormat:@"%c", ch];
            }
        }
        
        cst_wave * wave = flite_text_to_wave([cleanString UTF8String], voice);
        
        AVAudioPlayer * player = nil;
        NSError * error = nil;
        
        if(wave == NULL)
        {
            // -riffDataForCSTWave: reads wave->samples, so a NULL wave must
            // not be passed on.
            NSLog(@"Flite Error: could not synthesize text");
        }
        else
        {
            // copy sound into soundData
            NSData * soundData = [self riffDataForCSTWave:wave];
            delete_wave(wave);
            
            player = [[[AVAudioPlayer alloc] initWithData:soundData error:&error] autorelease];
            if(player == nil)
            {
                NSLog(@"AVAudioPlayer Error: %@", error.localizedDescription);
            }
        }
        
        self.audioPlayer = player;
        
        // Judge success by the returned player, not the error pointer: the
        // pointer was previously read uninitialised and is only meaningful
        // when the initializer fails.
        if(player == nil)
        {
            self.isSpeaking = NO;
        }
        else 
        {
            [player setDelegate:self];
            [player prepareToPlay];
            [player play];
        }
        
        [pool drain];
        
        [NSThread exit];
    }
    
    // Returns the samples of wave preceded by a RIFF/WAVE header describing
    // 16-bit PCM audio. All header fields are appended in host byte order.
    - (NSData *)riffDataForCSTWave:(cst_wave *)wave {
        // Header values, computed up front in the order they are written.
        const int riffChunkSize = (cst_wave_num_samples(wave) * cst_wave_num_channels(wave) * sizeof(short)) + 8 + 16 + 12;
        const int fmtChunkSize = 16;
        const short formatTag = RIFF_FORMAT_PCM;
        const short channelCount = cst_wave_num_channels(wave);
        const int sampleRate = cst_wave_sample_rate(wave);
        const int bytesPerSecond = (cst_wave_sample_rate(wave) * cst_wave_num_channels(wave) * sizeof(short));
        const short blockAlign = (cst_wave_num_channels(wave) * sizeof(short));
        const short bitsPerSample = 2 * 8;
        const int dataChunkSize = (cst_wave_num_channels(wave) * cst_wave_num_samples(wave) * sizeof(short));

        NSMutableData * riff = [NSMutableData data];

        // RIFF container header
        [riff appendBytes:"RIFF" length:4];
        [riff appendBytes:&riffChunkSize length:sizeof(riffChunkSize)];
        [riff appendBytes:"WAVE" length:4];

        // "fmt " chunk
        [riff appendBytes:"fmt " length:4];
        [riff appendBytes:&fmtChunkSize length:sizeof(fmtChunkSize)];
        [riff appendBytes:&formatTag length:sizeof(formatTag)];
        [riff appendBytes:&channelCount length:sizeof(channelCount)];
        [riff appendBytes:&sampleRate length:sizeof(sampleRate)];
        [riff appendBytes:&bytesPerSecond length:sizeof(bytesPerSecond)];
        [riff appendBytes:&blockAlign length:sizeof(blockAlign)];
        [riff appendBytes:&bitsPerSample length:sizeof(bitsPerSample)];

        // "data" chunk: size field followed by the raw samples
        [riff appendBytes:"data" length:4];
        [riff appendBytes:&dataChunkSize length:sizeof(dataChunkSize)];
        [riff appendBytes:wave->samples length:dataChunkSize];

        return riff;
    }
    
    // Stores pitch, variance and speed on the current voice as the
    // "int_f0_target_mean", "int_f0_target_stddev" and "duration_stretch"
    // features, in that order.
    -(void)setPitch:(float)pitch variance:(float)variance speed:(float)speed
    {
        cst_features * voiceFeatures = voice->features;

        feat_set_float(voiceFeatures, "int_f0_target_mean", pitch);
        feat_set_float(voiceFeatures, "int_f0_target_stddev", variance);
        feat_set_float(voiceFeatures, "duration_stretch", speed);
    }
    
    // Selects the Flite voice whose name equals voicename. Names that match
    // none of the known voices leave the current voice untouched.
    -(void)setVoice:(NSString *)voicename
    {
        // Parallel tables: knownNames[i] is registered by registrars[i].
        NSString * const knownNames[] = {
            @"cmu_us_kal",
            @"cmu_us_kal16",
            @"cmu_us_rms",
            @"cmu_us_awb",
            @"cmu_us_slt",
        };
        cst_voice *(* const registrars[])() = {
            register_cmu_us_kal,
            register_cmu_us_kal16,
            register_cmu_us_rms,
            register_cmu_us_awb,
            register_cmu_us_slt,
        };
        const size_t voiceCount = sizeof(knownNames) / sizeof(knownNames[0]);

        for(size_t i = 0; i < voiceCount; i++)
        {
            if([voicename isEqualToString:knownNames[i]])
            {
                voice = registrars[i]();
                return;
            }
        }
    }
    
    // Discards everything still queued, halts the current player and marks
    // the speaker idle.
    -(void)stopSpeaking
    {
        // Drop pending strings first so nothing new is picked up.
        [[self speechQueue] removeAllObjects];

        // Halt and release whatever is currently playing.
        [[self audioPlayer] stop];
        [self setAudioPlayer:nil];

        [self setIsSpeaking:NO];
    }
    
    // AVAudioPlayerDelegate callback: releases the finished player, marks the
    // speaker idle and moves on to the next queued string, if any.
    // The success flag is not inspected.
    - (void)audioPlayerDidFinishPlaying:(AVAudioPlayer *)player successfully:(BOOL)flag 
    {
        [self setAudioPlayer:nil];
        [self setIsSpeaking:NO];

        [self speakIfNeeded];
    }
    
    @end
    
  2. dharry

    I implemented the above and it works great, much faster than writing to a file and playing it that way, but it is still too slow for what I need.

    Have you done any more work on this? I have seen flite implementations where the playback is almost instantaneous. I'm not sure which voices they use, though; I know Kal is pretty quick, but I need a female voice, so I have been using SLT...

    Cheers, Dean

  3. Log in to comment