Snippets
Created by
David Macias
last modified
from __future__ import print_function
import boto3
import uuid
import logging
import sys
import os
import time
import json
import urllib.request
import pandas
from botocore.exceptions import ClientError
from dotenv import load_dotenv
# Load settings from a .env file (if present) into the process environment;
# the script reads REGION from the environment further down.
load_dotenv()
# Root directory that is walked recursively for audio files to transcribe.
local_directory = 'sourcePrompts/'
# Only files whose names end with this suffix are uploaded and transcribed.
file_extension = '.wav'
# Passed to Transcribe as the MediaFormat of every job.
media_format = 'wav'
# Passed to Transcribe as the LanguageCode of every job.
language_code = 'en-US'
def create_unique_bucket_name(bucket_prefix):
    """Return ``bucket_prefix`` followed by a random UUID4 string.

    S3 bucket names must be between 3 and 63 characters long. The UUID
    suffix is always 36 characters, so the prefix may be at most 27
    characters; a longer prefix is rejected here with a clear message
    instead of failing later inside the S3 ``create_bucket`` call.

    Args:
        bucket_prefix: Text placed in front of the generated UUID.

    Returns:
        The concatenation of ``bucket_prefix`` and ``str(uuid.uuid4())``.

    Raises:
        ValueError: If the resulting name would exceed 63 characters.
    """
    max_bucket_name_length = 63
    bucket_name = ''.join([bucket_prefix, str(uuid.uuid4())])
    if len(bucket_name) > max_bucket_name_length:
        raise ValueError(
            "Bucket prefix %r is too long: the generated name would be %d "
            "characters, but bucket names are limited to %d."
            % (bucket_prefix, len(bucket_name), max_bucket_name_length))
    return bucket_name
def create_bucket(bucket_prefix, s3_connection, bucket_region=None):
    """Create an S3 bucket with a unique, prefixed name.

    Args:
        bucket_prefix: Prefix handed to ``create_unique_bucket_name``.
        s3_connection: Object exposing ``create_bucket(**kwargs)``; the
            script passes a boto3 S3 client.
        bucket_region: Region to create the bucket in. When omitted, the
            module-level ``region`` value is used, which is what this
            function always did before the parameter existed.

    Returns:
        A ``(bucket_name, bucket_response)`` tuple, where
        ``bucket_response`` is whatever ``s3_connection.create_bucket``
        returned.
    """
    target_region = region if bucket_region is None else bucket_region
    bucket_name = create_unique_bucket_name(bucket_prefix)

    request = {"Bucket": bucket_name}
    # "us-east-1" is requested by leaving CreateBucketConfiguration out;
    # every other region has to be named through LocationConstraint.
    if target_region != "us-east-1":
        request["CreateBucketConfiguration"] = {
            "LocationConstraint": target_region,
        }

    bucket_response = s3_connection.create_bucket(**request)
    return bucket_name, bucket_response
def delete_all_objects(bucket_name):
    """Delete every object version stored in the bucket ``bucket_name``.

    The bucket is looked up through the module-level ``s3Resource``. Every
    entry yielded by ``bucket.object_versions.all()`` is collected and then
    removed with ``delete_objects``. The removal is sent in batches because
    a single S3 DeleteObjects request accepts at most 1000 keys.

    Args:
        bucket_name: Name of the bucket to empty.

    Returns:
        None. Prints "Bucket is empty." and returns early when there is
        nothing to delete.
    """
    batch_limit = 1000  # maximum number of keys per DeleteObjects request

    bucket = s3Resource.Bucket(bucket_name)
    targets = [
        {'Key': version.object_key, 'VersionId': version.id}
        for version in bucket.object_versions.all()
    ]
    if not targets:
        print("Bucket is empty.")
        return

    print("Deleting objects in bucket.")
    for start in range(0, len(targets), batch_limit):
        bucket.delete_objects(
            Delete={'Objects': targets[start:start + batch_limit]})
# ---------------------------------------------------------------------------
# Script body: upload every matching audio file under ``local_directory`` to
# a temporary S3 bucket, transcribe each one with Amazon Transcribe, write
# the results to prompts.csv, then remove the temporary bucket.
# ---------------------------------------------------------------------------

# Replace 'YOUR_PROFILE_NAME' with a profile from your AWS configuration.
session = boto3.session.Session(profile_name='YOUR_PROFILE_NAME')
region = os.getenv("REGION")
if not region:
    # os.getenv returns None when REGION is unset, which would otherwise
    # surface as an unhelpful TypeError in the concatenation below.
    sys.exit("REGION environment variable is not set.")
print("Region: "+region)
s3Client = session.client('s3', region_name=region)
# Use the same region as the other clients so bucket operations do not
# depend on the profile's default region.
s3Resource = session.resource('s3', region_name=region)
transcribe = session.client('transcribe', region_name=region)

# Characters accepted in a Transcribe job name (pattern [0-9a-zA-Z._-]+).
job_name_allowed_chars = frozenset(
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "0123456789._-")
job_name_max_length = 200

# One dict per processed file; turned into a DataFrame once at the end.
# (DataFrame.append was removed in pandas 2.0 and copied the frame per row.)
transcript_rows = []

# Create bucket
bucket_name, first_response = create_bucket(
    bucket_prefix='transcription-',
    s3_connection=s3Client)
print("Bucket created: %s" % bucket_name)

print("Checking bucket.")
# Must be initialised: if the bucket is not listed, the check below would
# otherwise fail with NameError instead of reporting the problem.
good_to_go = False
for bucket in s3Resource.buckets.all():
    if bucket.name == bucket_name:
        print("Bucket ready.")
        good_to_go = True
        break
if not good_to_go:
    print("Error with bucket.")
    sys.exit(1)

try:
    # enumerate local files recursively
    print("Checking files.")
    for root, _dirs, files in os.walk(local_directory):
        for filename in files:
            if not filename.endswith(file_extension):
                continue

            # construct the full local path
            local_path = os.path.join(root, filename)
            print("Local path: %s" % local_path)
            # path relative to local_directory; used as the prompt name
            relative_path = os.path.relpath(local_path, local_directory)
            print("File name: %s" % relative_path)
            # the local path doubles as the object key in the bucket
            s3_path = local_path

            print("Searching for %s in bucket %s" % (s3_path, bucket_name))
            try:
                s3Client.head_object(Bucket=bucket_name, Key=s3_path)
                print("Path found on bucket. Skipping %s..." % s3_path)
            except ClientError:
                # Any client error is treated as "not there" (a missing key
                # can be reported as 403 as well as 404), so just upload.
                print("File not found.")
                print("Uploading %s..." % s3_path)
                s3Client.upload_file(local_path, bucket_name, s3_path)

            # relative_path may contain path separators or spaces, which are
            # not valid in a job name; replace anything outside the allowed
            # set and respect the maximum length.
            raw_job_name = bucket_name+"_"+relative_path
            job_name = ''.join(
                ch if ch in job_name_allowed_chars else '_'
                for ch in raw_job_name)[:job_name_max_length]
            print("Job Name: "+job_name)
            job_uri = "s3://%s/%s" % (bucket_name, s3_path)
            print("Job URI: "+job_uri)

            transcribe.start_transcription_job(
                TranscriptionJobName=job_name,
                Media={'MediaFileUri': job_uri},
                MediaFormat=media_format,
                LanguageCode=language_code)

            # Poll every 10 seconds until the job reaches a final state.
            while True:
                status = transcribe.get_transcription_job(
                    TranscriptionJobName=job_name)
                job_state = status['TranscriptionJob']['TranscriptionJobStatus']
                if job_state in ['COMPLETED', 'FAILED']:
                    break
                print('Transcription ' + job_state)
                time.sleep(10)

            if job_state == 'COMPLETED':
                transcript_uri = (
                    status['TranscriptionJob']['Transcript']['TranscriptFileUri'])
                # Context manager closes the HTTP response once it is read.
                with urllib.request.urlopen(transcript_uri) as response:
                    data = json.loads(response.read())
                text = data['results']['transcripts'][0]['transcript']
                print("%s, %s "%(job_name, text))
                transcript_rows.append(
                    {"Prompt Name": relative_path, "Verbiage": text})
            else:
                print("Transcription failed: %s" % (
                    status['TranscriptionJob'].get('FailureReason'),))
                transcript_rows.append(
                    {"Prompt Name": relative_path, "Verbiage": "ERROR"})

            print("Deleting transcription job.")
            transcribe.delete_transcription_job(TranscriptionJobName=job_name)

    # Create csv
    print("Writing CSV")
    data_frame = pandas.DataFrame(transcript_rows)
    data_frame.to_csv('prompts.csv', index=False)
finally:
    # Always remove the temporary bucket, even if processing failed midway.
    # Empty bucket
    print("Emptying bucket.")
    delete_all_objects(bucket_name)
    # Delete empty bucket
    s3Resource.Bucket(bucket_name).delete()
    print("Bucket deleted.")
|
Comments (0)
You can clone a snippet to your computer for local editing. Learn more.