# Snippets
#
# David Macias - Using AWS Transcribe to get IVR prompt verbiage
#
# Created by David Macias
from __future__ import print_function
from botocore.exceptions import ClientError

import boto3
import uuid
import logging
import sys
import os
import time
import json
import urllib.request
import pandas

local_directory = 'French/'
file_extension = '.wav'
media_format = 'wav'
language_code = 'fr-CA'

def create_unique_bucket_name(bucket_prefix):
    """Return ``bucket_prefix`` followed by a freshly generated UUID4 string.

    S3 bucket names must be between 3 and 63 characters long. The UUID
    suffix is always 36 characters, so ``bucket_prefix`` should not be
    longer than 27 characters.
    """
    unique_suffix = str(uuid.uuid4())
    return bucket_prefix + unique_suffix

def create_bucket(bucket_prefix, s3_connection):
    """Create an S3 bucket with a unique name in the session's region.

    Args:
        bucket_prefix: Text placed in front of the generated UUID to form
            the bucket name.
        s3_connection: boto3 S3 client used to issue the ``create_bucket``
            call.

    Returns:
        A ``(bucket_name, bucket_response)`` tuple: the generated name and
        the raw response returned by ``create_bucket``.
    """
    session = boto3.session.Session()
    current_region = session.region_name
    bucket_name = create_unique_bucket_name(bucket_prefix)

    create_args = {'Bucket': bucket_name}
    # S3 requires an explicit LocationConstraint for every region except
    # us-east-1, where supplying one is rejected. Without it the call fails
    # whenever the configured region is anything other than us-east-1.
    if current_region and current_region != 'us-east-1':
        create_args['CreateBucketConfiguration'] = {
            'LocationConstraint': current_region,
        }

    bucket_response = s3_connection.create_bucket(**create_args)
    return bucket_name, bucket_response

def delete_all_objects(bucket_name):
    """Delete every object version stored in the bucket ``bucket_name``.

    Uses the module-level ``s3Resource``. Does nothing when the bucket is
    already empty.
    """
    bucket = s3Resource.Bucket(bucket_name)
    res = [
        {'Key': obj_version.object_key, 'VersionId': obj_version.id}
        for obj_version in bucket.object_versions.all()
    ]
    # A DeleteObjects request must carry between 1 and 1000 keys: an empty
    # list is rejected as malformed and a longer one exceeds the API limit,
    # so send the keys in batches and skip the call when there are none.
    batch_size = 1000
    for start in range(0, len(res), batch_size):
        bucket.delete_objects(
            Delete={'Objects': res[start:start + batch_size]})

# AWS handles shared by the rest of the script.
s3Client = boto3.client('s3')
s3Resource = boto3.resource('s3')
transcribe = boto3.client('transcribe')
# Declaring the columns up front means the CSV still gets a header row when
# no prompt ends up being transcribed.
data_frame = pandas.DataFrame(columns=["Prompt Name", "Verbiage"])

# Create bucket
bucket_name, first_response = create_bucket(
    bucket_prefix='transcription-',
    s3_connection=s3Client)

print("Bucket created %s" % bucket_name)

print("Checking bucket.")
# Must start out False: otherwise a bucket that is missing from the listing
# leaves the flag undefined and the check below dies with a NameError
# instead of reporting the problem.
good_to_go = False
for bucket in s3Resource.buckets.all():
    if bucket.name == bucket_name:
        print("Bucket ready.")
        good_to_go = True
        break

if not good_to_go:
    print("Error with bucket.")
    # Exit with a non-zero status so callers can detect the failure.
    sys.exit(1)

# Characters Amazon Transcribe accepts in a TranscriptionJobName
# (documented pattern ^[0-9a-zA-Z._-]+, at most 200 characters).
_JOB_NAME_CHARS = frozenset(
    'abcdefghijklmnopqrstuvwxyz'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '0123456789._-')

# Enumerate local audio files recursively. Each file that is not yet in the
# bucket is uploaded, transcribed, and its text appended to data_frame.
for root, dirs, files in os.walk(local_directory):
    for filename in files:
        if not filename.endswith(file_extension):
            continue

        # construct the full local path
        local_path = os.path.join(root, filename)
        print("Local path: %s" % local_path)
        # path relative to the scanned directory; used as the prompt name
        relative_path = os.path.relpath(local_path, local_directory)
        print("File name: %s" % relative_path)
        # S3 keys use '/' as separator regardless of the local OS
        s3_path = local_path.replace(os.sep, '/')
        print("Searching for %s in bucket %s" % (s3_path, bucket_name))
        try:
            s3Client.head_object(Bucket=bucket_name, Key=s3_path)
        except ClientError as error:
            # Only AWS-side errors mean "not there yet". A bare except would
            # also swallow KeyboardInterrupt and programming errors.
            error_code = error.response.get('Error', {}).get('Code')
            if error_code not in ('404', 'NoSuchKey', 'NotFound'):
                # Still attempt the upload; it will fail loudly if the
                # underlying problem (e.g. permissions) is real.
                print("Unexpected error %s while checking %s" % (
                    error_code, s3_path))
        else:
            print("Path found on bucket. Skipping %s..." % s3_path)
            continue

        print("Uploading %s..." % s3_path)
        s3Client.upload_file(local_path, bucket_name, s3_path)

        # The relative path may contain separators, spaces or accented
        # characters that Transcribe rejects in a job name, so map anything
        # outside the allowed set to '_'. The CSV keeps the original path.
        job_name = ''.join(
            ch if ch in _JOB_NAME_CHARS else '_'
            for ch in relative_path)[:200]
        # The s3:// form needs no URL-encoding of the key, unlike https URLs.
        job_uri = "s3://%s/%s" % (bucket_name, s3_path)
        transcribe.start_transcription_job(
            TranscriptionJobName=job_name,
            Media={'MediaFileUri': job_uri},
            MediaFormat=media_format,
            LanguageCode=language_code
        )
        try:
            # Poll until the job reaches a terminal state.
            while True:
                status = transcribe.get_transcription_job(
                    TranscriptionJobName=job_name)
                job_status = status['TranscriptionJob']['TranscriptionJobStatus']
                print('Transcription ' + job_status)
                if job_status in ('COMPLETED', 'FAILED'):
                    break
                time.sleep(25)

            if job_status == 'COMPLETED':
                transcript_uri = (
                    status['TranscriptionJob']['Transcript']['TranscriptFileUri'])
                with urllib.request.urlopen(transcript_uri) as response:
                    data = json.loads(response.read().decode('utf-8'))
                transcripts = data['results']['transcripts']
                text = transcripts[0]['transcript'] if transcripts else ''
                print("%s, %s " % (relative_path, text))
                # DataFrame.append was removed in pandas 2.0; concat is the
                # supported way to add a row.
                new_row = pandas.DataFrame(
                    [{"Prompt Name": relative_path, "Verbiage": text}])
                data_frame = pandas.concat(
                    [data_frame, new_row], ignore_index=True)
            else:
                # A failed job has no 'Transcript' entry; report and move on
                # instead of crashing with a KeyError.
                print("Transcription failed for %s: %s" % (
                    relative_path,
                    status['TranscriptionJob'].get(
                        'FailureReason', 'unknown reason')))
        finally:
            # Always remove the job so a rerun does not hit a name conflict.
            print("Deleting transcription job.")
            transcribe.delete_transcription_job(TranscriptionJobName=job_name)

# Persist every collected prompt/verbiage pair to disk.
print("Writing CSV")
data_frame.to_csv(path_or_buf='prompts.csv', index=False)

# Purge all objects from the temporary bucket before removing it.
print("Emptying bucket.")
delete_all_objects(bucket_name)

# Remove the now-empty temporary bucket.
s3Resource.Bucket(
    bucket_name
).delete()
print("Bucket deleted.")

# Comments (0)