#!/usr/bin/env python

# Copyright 2017 Google Inc. All Rights Reserved.

"""Transcribe an audio file asynchronously with the Google Cloud Speech API.

A local file is transcoded to mono FLAC with ffmpeg, uploaded to a temporary
Cloud Storage object, transcribed, and the temporary object is deleted once
the operation finishes. A ``gs://`` URI is passed to the API as-is and is
expected to already be FLAC at the configured sample rate.

The results are written to a CSV file with the columns ``timestamp``,
``confidence`` and ``transcript``.

Example usage:
    python transcribe_async.py recording.mp3
    python transcribe_async.py gs://my-bucket/recording.flac --out out.csv
"""

import argparse
import csv
import datetime
import io
import os
import subprocess

# NOTE: the Google Cloud client libraries are imported inside the functions
# that use them (as this sample did before), so the pure helpers below and
# ``--help`` keep working when the SDKs are not installed.

# Bucket that receives the temporary upload. Override it with the
# UPLOAD_BUCKET_NAME environment variable; the default is unchanged.
UPLOAD_BUCKET_NAME = os.environ.get('UPLOAD_BUCKET_NAME', 'bjoeris-temp-audio')

# Sample rate shared by the ffmpeg transcode and the recognition config; the
# two must agree or the API rejects/misreads the audio.
SAMPLE_RATE_HERTZ = 48000


def _safe_filename(filename):
    """
    Generates a safe filename that is unlikely to collide with existing objects
    in Google Cloud Storage.

    ``filename.ext`` is transformed into ``filename-YYYY-MM-DD-HHMMSS.ext``.
    Only the base name of ``filename`` is used; the timestamp is in UTC.
    """
    stamp = datetime.datetime.now(datetime.timezone.utc).strftime(
        "%Y-%m-%d-%H%M%S")
    basename, extension = os.path.splitext(os.path.basename(filename))
    # splitext() keeps the leading dot in ``extension``, so no extra '.' here.
    return "{0}-{1}{2}".format(basename, stamp, extension)


# [START def_transcribe_file]
def transcribe_file(filename, output):
    """Transcribe the given local audio file asynchronously.

    The file is transcoded with ``transcode_file``, uploaded to
    ``UPLOAD_BUCKET_NAME`` under a timestamped name and handed to
    ``transcribe_gcs``. The uploaded object is deleted when the operation
    completes.

    Args:
        filename: Path of the local audio file.
        output: Path of the CSV file the results are written to.

    Returns:
        The long-running operation returned by ``transcribe_gcs``.
    """
    from google.cloud import storage

    client = storage.Client()

    print("Converting file...")
    filename = transcode_file(filename)

    bucket_name = UPLOAD_BUCKET_NAME
    blob_name = _safe_filename(filename)
    blob = client.bucket(bucket_name).blob(blob_name)
    uri = "gs://{}/{}".format(bucket_name, blob_name)
    print("Uploading file...", uri)
    with io.open(filename, 'rb') as audio_file:
        blob.upload_from_file(audio_file)

    def delete_upload(operation_future):
        print("Deleting file...")
        blob.delete()

    try:
        operation = transcribe_gcs(uri, output)
    except Exception:
        # No operation exists, so no done-callback would ever remove the
        # temporary object; clean it up here before propagating the error.
        delete_upload(None)
        raise
    operation.add_done_callback(delete_upload)
    return operation
# [END def_transcribe_file]


def transcode_file(filename):
    """Transcode ``filename`` to mono FLAC at ``SAMPLE_RATE_HERTZ`` via ffmpeg.

    The result is written next to the input as ``<name>-transcode.flac``.

    Args:
        filename: Path of the audio file to convert.

    Returns:
        The path of the transcoded file.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    stripped_name, _ = os.path.splitext(filename)
    output = '{}-transcode.flac'.format(stripped_name)
    command = [
        'ffmpeg', '-i', filename,
        '-ac', '1',
        '-ar', str(SAMPLE_RATE_HERTZ),
        '-acodec', 'flac',
        output,
    ]
    # check=True: fail here instead of uploading a missing or partial file.
    subprocess.run(command, check=True)
    print("transcoded: ", output)
    return output


def transcribe_gcs(gcs_uri, output):
    """Asynchronously transcribes the audio file specified by the gcs_uri.

    Args:
        gcs_uri: ``gs://`` URI of a FLAC file sampled at
            ``SAMPLE_RATE_HERTZ``.
        output: Path of the CSV file the results are written to when the
            operation completes.

    Returns:
        The long-running operation; call ``result()`` on it to wait.
    """
    from google.cloud import speech_v1p1beta1 as speech

    client = speech.SpeechClient()
    audio = speech.types.RecognitionAudio(uri=gcs_uri)

    metadata_enums = speech.enums.RecognitionMetadata
    metadata = speech.types.RecognitionMetadata()
    metadata.interaction_type = metadata_enums.InteractionType.DISCUSSION
    metadata.microphone_distance = (
        metadata_enums.MicrophoneDistance.NEARFIELD)
    metadata.recording_device_type = metadata_enums.RecordingDeviceType.PC

    config = speech.types.RecognitionConfig(
        encoding=speech.enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=SAMPLE_RATE_HERTZ,
        language_code='en-US',
        metadata=metadata,
        enable_automatic_punctuation=True,
        # Word offsets are what save_results() uses for the timestamp column.
        enable_word_time_offsets=True,
    )

    print('Transcribing... {}'.format(gcs_uri))
    operation = client.long_running_recognize(config, audio)
    operation.add_done_callback(
        lambda operation_future: save_results(
            operation_future.result().results, output))
    return operation


def _format_timestamp(seconds):
    """Format a non-negative number of seconds as ``HH:MM:SS``."""
    minutes, secs = divmod(int(seconds), 60)
    hours, minutes = divmod(minutes, 60)
    return '{:0>2d}:{:0>2d}:{:0>2d}'.format(hours, minutes, secs)


def save_results(results, output):
    """Write the recognition results to ``output`` as CSV and echo them.

    Each result is for a consecutive portion of the audio; one row is written
    per result, using its first alternative. The timestamp is the start time
    of the alternative's first word, or empty when it has no words.

    Args:
        results: Iterable of results, each with an ``alternatives`` sequence.
        output: Path of the CSV file to create (UTF-8 encoded).
    """
    fieldnames = ['timestamp', 'confidence', 'transcript']
    # newline='' is required by the csv module; utf-8 keeps non-ASCII
    # transcripts intact regardless of the locale.
    with open(output, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.DictWriter(csvfile, fieldnames=fieldnames)
        csvwriter.writeheader()
        for result in results:
            if not result.alternatives:
                # Nothing was recognised for this portion; skip it rather
                # than fail on alternatives[0].
                continue
            alternative = result.alternatives[0]
            if alternative.words:
                # Sub-second precision (nanos) is not shown in HH:MM:SS.
                timestamp_str = _format_timestamp(
                    alternative.words[0].start_time.seconds)
            else:
                timestamp_str = ''
            confidence_str = '{:.2f}'.format(alternative.confidence)
            csvwriter.writerow({
                'timestamp': timestamp_str,
                'confidence': confidence_str,
                'transcript': alternative.transcript,
            })
            print(u'{} | {} | {}'.format(
                timestamp_str, confidence_str, alternative.transcript))


def _default_output_path(audio_path):
    """Return the default CSV path for ``audio_path``.

    A local path keeps its directory and swaps the extension for ``.csv``.
    A ``gs://`` URI cannot be written to locally, so only its object base
    name is used and the CSV lands in the current directory.
    """
    if audio_path.startswith('gs://'):
        audio_path = audio_path.rsplit('/', 1)[-1]
    return os.path.splitext(audio_path)[0] + ".csv"


def main():
    """Parse the command line, start the transcription and wait for it."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'audio_file', help='File or GCS path for audio file to be transcribed')
    parser.add_argument(
        '--out', help='File to save the results (CSV)')
    args = parser.parse_args()

    if args.out is None:
        args.out = _default_output_path(args.audio_file)
    if args.audio_file.startswith('gs://'):
        operation = transcribe_gcs(args.audio_file, args.out)
    else:
        operation = transcribe_file(args.audio_file, args.out)
    # Block until the recognition operation has finished.
    operation.result()


if __name__ == '__main__':
    main()