Snippets

David Macias Using AWS Transcribe to get IVR prompts verbiage

Created by David Macias last modified
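Example .env file contents (the script below reads REGION via load_dotenv() and os.getenv("REGION")): REGION=us-west-2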
from __future__ import print_function

import boto3
import uuid
import logging
import sys
import os
import time
import json
import urllib.request
import pandas

from botocore.exceptions import ClientError
from dotenv import load_dotenv

# Load settings from a .env file into the environment; the script later reads
# REGION from there with os.getenv().
load_dotenv()

# Directory that is walked recursively (os.walk) to find prompt recordings.
local_directory = 'sourcePrompts/'
# Only files whose names end with this suffix are uploaded and transcribed.
file_extension = '.wav'
# Passed as MediaFormat when a transcription job is started.
media_format = 'wav'
# Passed as LanguageCode when a transcription job is started.
language_code = 'en-US'

def create_unique_bucket_name(bucket_prefix):
    """Return ``bucket_prefix`` followed by a freshly generated UUID4 string.

    Args:
        bucket_prefix: Text placed in front of the random suffix.

    Returns:
        The prefix concatenated with the 36-character string form of a
        random UUID.
    """
    # The generated bucket name must be between 3 and 63 chars long
    random_suffix = str(uuid.uuid4())
    return ''.join((bucket_prefix, random_suffix))

def create_bucket(bucket_prefix, s3_connection):
    """Create an S3 bucket with a unique, prefix-based name.

    The target region is taken from the module-level ``region`` variable.
    For "us-east-1" the request carries only the bucket name; for any other
    region a ``LocationConstraint`` naming that region is sent as well.

    Args:
        bucket_prefix: Prefix handed to ``create_unique_bucket_name``.
        s3_connection: Object exposing ``create_bucket`` (the script passes
            its boto3 S3 client).

    Returns:
        A ``(bucket_name, bucket_response)`` tuple: the generated name and
        whatever ``s3_connection.create_bucket`` returned.
    """
    name = create_unique_bucket_name(bucket_prefix)

    # Build the request once; only the location constraint differs by region.
    request = {"Bucket": name}
    if region != "us-east-1":
        request["CreateBucketConfiguration"] = {"LocationConstraint": region}

    response = s3_connection.create_bucket(**request)
    return name, response

def delete_all_objects(bucket_name):
    """Delete every object version listed in the given bucket.

    Uses the module-level ``s3Resource`` to enumerate
    ``bucket.object_versions.all()`` and removes each listed key/version
    pair. Prints a short status message either way.

    Args:
        bucket_name: Name of the bucket to empty.

    Returns:
        None.
    """
    bucket = s3Resource.Bucket(bucket_name)
    targets = [
        {'Key': obj_version.object_key, 'VersionId': obj_version.id}
        for obj_version in bucket.object_versions.all()
    ]
    if not targets:
        print("Bucket is empty.")
        return

    print("Deleting objects in bucket.")
    # S3 DeleteObjects accepts at most 1000 keys per request, so send the
    # deletions in batches instead of one (possibly oversized) call.
    batch_size = 1000
    for start in range(0, len(targets), batch_size):
        batch = targets[start:start + batch_size]
        bucket.delete_objects(Delete={'Objects': batch})

# ---------------------------------------------------------------------------
# Main script: create a temporary bucket, upload each prompt recording, run an
# Amazon Transcribe job per file, collect the transcripts into prompts.csv and
# finally remove the temporary bucket.
# ---------------------------------------------------------------------------

# Credentials come from the named profile in the local AWS configuration.
session = boto3.session.Session(profile_name='YOUR_PROFILE_NAME')

region = os.getenv("REGION")
if not region:
    # Fail with a clear message rather than a TypeError in the print below.
    sys.exit("REGION environment variable is not set (e.g. REGION=us-west-2).")
print("Region: "+region)
s3Client = session.client('s3', region_name=region)
# Use the same region as the clients so all S3 calls hit the same endpoint.
s3Resource = session.resource('s3', region_name=region)
transcribe = session.client('transcribe', region_name=region)

# Create bucket
bucket_name, first_response = create_bucket(
    bucket_prefix='transcription-',
    s3_connection=s3Client)

print("Bucket created: %s" % bucket_name)

print("Checking bucket.")
# Must be initialised: otherwise a missing bucket raises NameError below.
good_to_go = False
for bucket in s3Resource.buckets.all():
    if bucket.name == bucket_name:
        print("Bucket ready.")
        good_to_go = True
        break

if not good_to_go:
    print("Error with bucket.")
    sys.exit(1)

# One dict per processed prompt; turned into a DataFrame at the end because
# DataFrame.append() no longer exists in pandas 2.x.
rows = []

try:
    # enumerate local files recursively
    print("Checking files.")
    for root, dirs, files in os.walk(local_directory):
        for filename in files:
            if not filename.endswith(file_extension):
                continue

            # construct the full local path
            local_path = os.path.join(root, filename)
            print("Local path: %s" % local_path)
            # path relative to the prompts directory, used as the prompt name
            relative_path = os.path.relpath(local_path, local_directory)
            print("File name: %s" % relative_path)
            # S3 keys use forward slashes regardless of the local OS separator
            s3_path = local_path.replace(os.sep, "/")
            print("Searching for %s in bucket %s" % (s3_path, bucket_name))

            try:
                s3Client.head_object(Bucket=bucket_name, Key=s3_path)
            except ClientError as err:
                # Only a "not found" answer means the file still has to be
                # uploaded; anything else (permissions, throttling, ...) is a
                # real error and must not be silently swallowed.
                error_code = err.response.get("Error", {}).get("Code")
                if error_code not in ("404", "NoSuchKey", "NotFound"):
                    raise
            else:
                print("Path found on bucket. Skipping %s..." % s3_path)
                continue

            print("File not found.")
            print("Uploading %s..." % s3_path)
            s3Client.upload_file(local_path, bucket_name, s3_path)

            # Transcribe job names only allow [0-9a-zA-Z._-] and at most 200
            # characters, so replace path separators, spaces, etc.
            raw_job_name = bucket_name+"_"+relative_path
            job_name = "".join(
                ch if (ch.isascii() and ch.isalnum()) or ch in "._-" else "_"
                for ch in raw_job_name
            )[:200]
            print("Job Name: "+job_name)
            job_uri = "s3://%s/%s" % (bucket_name, s3_path)
            print("Job URI: "+job_uri)

            transcribe.start_transcription_job(
                TranscriptionJobName=job_name,
                Media={'MediaFileUri': job_uri},
                MediaFormat=media_format,
                LanguageCode=language_code)
            try:
                # Poll until the job reaches a terminal state.
                while True:
                    status = transcribe.get_transcription_job(
                        TranscriptionJobName=job_name)
                    job_status = status['TranscriptionJob']['TranscriptionJobStatus']
                    if job_status in ('COMPLETED', 'FAILED'):
                        break
                    print('Transcription ' + job_status)
                    time.sleep(10)

                if job_status == 'COMPLETED':
                    transcript_uri = status['TranscriptionJob']['Transcript']['TranscriptFileUri']
                    # Context manager closes the HTTP response once it is read.
                    with urllib.request.urlopen(transcript_uri) as response:
                        data = json.loads(response.read())
                    text = data['results']['transcripts'][0]['transcript']
                    print("%s, %s "%(job_name, text))
                    rows.append({"Prompt Name": relative_path, "Verbiage": text})
                else:
                    print("Transcription failed: %s" % status['TranscriptionJob'].get('FailureReason'))
                    rows.append({"Prompt Name": relative_path, "Verbiage": "ERROR"})
            finally:
                # Remove the job even if polling or the download failed.
                print("Deleting transcription job.")
                transcribe.delete_transcription_job(TranscriptionJobName=job_name)

    #Create csv
    print("Writing CSV")
    data_frame = pandas.DataFrame(rows, columns=["Prompt Name", "Verbiage"])
    data_frame.to_csv('prompts.csv', index=False)
finally:
    # Always clean up the temporary bucket, including when processing failed.
    # Empty bucket
    print("Emptying bucket.")
    delete_all_objects(bucket_name)

    # Delete empty bucket
    s3Resource.Bucket(bucket_name).delete()
    print("Bucket deleted.")

Comments (0)

HTTPS SSH

You can clone a snippet to your computer for local editing. Learn more.