From a0a6a1450b99868053e846d30b90294cc599ea26 Mon Sep 17 00:00:00 2001
From: Benson Joeris <bjoeris@google.com>
Date: Thu, 17 May 2018 18:52:00 +0000
Subject: [PATCH 1/2] Added csv transcript

---
 speech/cloud-client/requirements.txt    |   1 +
 speech/cloud-client/transcribe_async.py | 120 ++++++++++++++----------
 2 files changed, 73 insertions(+), 48 deletions(-)

diff --git a/speech/cloud-client/requirements.txt b/speech/cloud-client/requirements.txt
index 87b74e0d733..476af52e39a 100644
--- a/speech/cloud-client/requirements.txt
+++ b/speech/cloud-client/requirements.txt
@@ -1 +1,2 @@
 google-cloud-speech==0.33.0
+google-cloud-storage==1.7.0
diff --git a/speech/cloud-client/transcribe_async.py b/speech/cloud-client/transcribe_async.py
index 0f1bb558535..48abeab0078 100644
--- a/speech/cloud-client/transcribe_async.py
+++ b/speech/cloud-client/transcribe_async.py
@@ -1,3 +1,4 @@
+
 #!/usr/bin/env python
 
 # Copyright 2017 Google Inc. All Rights Reserved.
@@ -23,70 +24,89 @@
 """
 
 import argparse
+import csv
+import datetime
 import io
+import os
+
+from google.cloud import speech_v1p1beta1 as speech
+from google.cloud import storage
 
+def _safe_filename(filename):
+        """
+        Generates a safe filename that is unlikely to collide with existing objects
+        in Google Cloud Storage.
+        ``filename.ext`` is transformed into ``filename-YYYY-MM-DD-HHMMSS.ext``
+        """
+        date = datetime.datetime.utcnow().strftime("%Y-%m-%d-%H%M%S")
+        basename, extension = filename.rsplit('.', 1)
+        return "{0}-{1}.{2}".format(basename, date, extension)
 
 # [START def_transcribe_file]
-def transcribe_file(speech_file):
+def transcribe_file(filename, output):
     """Transcribe the given audio file asynchronously."""
-    from google.cloud import speech
-    from google.cloud.speech import enums
-    from google.cloud.speech import types
-    client = speech.SpeechClient()
-
-    # [START migration_async_request]
-    with io.open(speech_file, 'rb') as audio_file:
-        content = audio_file.read()
-
-    audio = types.RecognitionAudio(content=content)
-    config = types.RecognitionConfig(
-        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
-        sample_rate_hertz=16000,
-        language_code='en-US')
-
-    # [START migration_async_response]
-    operation = client.long_running_recognize(config, audio)
-    # [END migration_async_request]
-
-    print('Waiting for operation to complete...')
-    response = operation.result(timeout=90)
-
-    # Each result is for a consecutive portion of the audio. Iterate through
-    # them to get the transcripts for the entire audio file.
-    for result in response.results:
-        # The first alternative is the most likely one for this portion.
-        print(u'Transcript: {}'.format(result.alternatives[0].transcript))
-        print('Confidence: {}'.format(result.alternatives[0].confidence))
-    # [END migration_async_response]
+    client = storage.Client()
+
+    bucket_name = 'bjoeris-temp-audio'
+    bucket = client.bucket(bucket_name)
+    blob_name = _safe_filename(filename)
+    blob = bucket.blob(blob_name)
+    print("Uploading file...")
+    with io.open(filename, 'rb') as audio_file:
+        blob.upload_from_file(audio_file)
+    uri = "gs://{}/{}".format(bucket_name, blob_name)
+
+    transcribe_gcs(uri, output)
+    print("Deleting file...")
+    blob.delete()
 # [END def_transcribe_file]
 
 
 # [START def_transcribe_gcs]
-def transcribe_gcs(gcs_uri):
+def transcribe_gcs(gcs_uri, output):
     """Asynchronously transcribes the audio file specified by the gcs_uri."""
-    from google.cloud import speech
-    from google.cloud.speech import enums
-    from google.cloud.speech import types
     client = speech.SpeechClient()
 
-    audio = types.RecognitionAudio(uri=gcs_uri)
-    config = types.RecognitionConfig(
-        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
+    audio = speech.types.RecognitionAudio(uri=gcs_uri)
+
+    metadata = speech.types.RecognitionMetadata()
+    metadata.interaction_type = speech.enums.RecognitionMetadata.InteractionType.DISCUSSION
+    metadata.microphone_distance = speech.enums.RecognitionMetadata.MicrophoneDistance.NEARFIELD
+    metadata.recording_device_type = speech.enums.RecognitionMetadata.RecordingDeviceType.PC
+    config = speech.types.RecognitionConfig(
+        encoding=speech.enums.RecognitionConfig.AudioEncoding.FLAC,
         sample_rate_hertz=16000,
-        language_code='en-US')
+        language_code='en-US',
+        metadata=metadata,
+        enable_automatic_punctuation=True,
+        enable_word_time_offsets=True)
 
     operation = client.long_running_recognize(config, audio)
 
-    print('Waiting for operation to complete...')
+    print('Transcribing...')
     response = operation.result(timeout=90)
 
     # Each result is for a consecutive portion of the audio. Iterate through
     # them to get the transcripts for the entire audio file.
-    for result in response.results:
-        # The first alternative is the most likely one for this portion.
-        print(u'Transcript: {}'.format(result.alternatives[0].transcript))
-        print('Confidence: {}'.format(result.alternatives[0].confidence))
-# [END def_transcribe_gcs]
+    timestamp = 0.0
+    with open(output, 'w', newline='') as csvfile:
+        fieldnames = ['timestamp', 'confidence', 'transcript']
+        csvwriter = csv.DictWriter(csvfile, fieldnames=fieldnames)
+        csvwriter.writeheader()
+        for result in response.results:
+            alternative = result.alternatives[0]
+            if len(alternative.words) > 0:
+                timestamp = alternative.words[0].start_time
+                timestamp = timestamp.seconds + 1e-9*timestamp.nanos
+                timestamp_mins = int(timestamp // 60)
+                timestamp_secs = timestamp - timestamp_mins * 60
+                csvwriter.writerow({
+                    'timestamp': '{}:{}'.format(timestamp_mins, timestamp_secs),
+                    'confidence': alternative.confidence,
+                    'transcript': alternative.transcript,
+                })
+                print(u'{}:{} | {} | {}'.format(timestamp_mins, timestamp_secs , alternative.confidence, alternative.transcript))
+# [END def_transcribe]
 
 
 if __name__ == '__main__':
@@ -94,9 +114,13 @@ def transcribe_gcs(gcs_uri):
         description=__doc__,
         formatter_class=argparse.RawDescriptionHelpFormatter)
     parser.add_argument(
-        'path', help='File or GCS path for audio file to be recognized')
+        'audio_file', help='File or GCS path for audio file to be transcribed')
+    parser.add_argument(
+        '--out', help='File to save the results (CSV)')
     args = parser.parse_args()
-    if args.path.startswith('gs://'):
-        transcribe_gcs(args.path)
+    if args.out is None:
+        args.out = os.path.splitext(args.audio_file)[0] + ".csv"
+    if args.audio_file.startswith('gs://'):
+        transcribe_gcs(args.audio_file, args.out)
     else:
-        transcribe_file(args.path)
+        transcribe_file(args.audio_file, args.out)

From e1089b9a22a181d920ef2547fb14189e1535a39d Mon Sep 17 00:00:00 2001
From: Benson Joeris <bjoeris@google.com>
Date: Sat, 26 May 2018 13:44:13 +0000
Subject: [PATCH 2/2] Transcode audio with ffmpeg and save results via callbacks

---
 speech/cloud-client/requirements.txt    |  4 +-
 speech/cloud-client/transcribe_async.py | 75 ++++++++++++++++---------
 2 files changed, 51 insertions(+), 28 deletions(-)

diff --git a/speech/cloud-client/requirements.txt b/speech/cloud-client/requirements.txt
index 476af52e39a..24207ec7414 100644
--- a/speech/cloud-client/requirements.txt
+++ b/speech/cloud-client/requirements.txt
@@ -1,2 +1,2 @@
-google-cloud-speech==0.33.0
-google-cloud-storage==1.7.0
+google-cloud-speech==0.34.0
+google-cloud-storage==1.10.0
diff --git a/speech/cloud-client/transcribe_async.py b/speech/cloud-client/transcribe_async.py
index 48abeab0078..339706d6525 100644
--- a/speech/cloud-client/transcribe_async.py
+++ b/speech/cloud-client/transcribe_async.py
@@ -28,10 +28,14 @@
 import datetime
 import io
 import os
+import subprocess
 
 from google.cloud import speech_v1p1beta1 as speech
+# from google.cloud import speech
 from google.cloud import storage
 
+UPLOAD_BUCKET_NAME = os.environ.get('UPLOAD_BUCKET_NAME', 'bjoeris-temp-audio')
+
 def _safe_filename(filename):
         """
         Generates a safe filename that is unlikely to collide with existing objects
@@ -39,7 +43,7 @@ def _safe_filename(filename):
         ``filename.ext`` is transformed into ``filename-YYYY-MM-DD-HHMMSS.ext``
         """
         date = datetime.datetime.utcnow().strftime("%Y-%m-%d-%H%M%S")
-        basename, extension = filename.rsplit('.', 1)
+        basename, extension = os.path.basename(filename).rsplit('.', 1)
         return "{0}-{1}.{2}".format(basename, date, extension)
 
 # [START def_transcribe_file]
@@ -47,22 +51,34 @@ def transcribe_file(filename, output):
     """Transcribe the given audio file asynchronously."""
     client = storage.Client()
 
-    bucket_name = 'bjoeris-temp-audio'
+    print("Converting file...")
+    filename = transcode_file(filename)
+
+    bucket_name = UPLOAD_BUCKET_NAME
     bucket = client.bucket(bucket_name)
     blob_name = _safe_filename(filename)
     blob = bucket.blob(blob_name)
-    print("Uploading file...")
+    uri = "gs://{}/{}".format(bucket_name, blob_name)
+    print("Uploading file...", uri)
     with io.open(filename, 'rb') as audio_file:
         blob.upload_from_file(audio_file)
-    uri = "gs://{}/{}".format(bucket_name, blob_name)
 
-    transcribe_gcs(uri, output)
-    print("Deleting file...")
-    blob.delete()
+    operation = transcribe_gcs(uri, output)
+    def callback(operation_future):
+        print("Deleting file...")
+        blob.delete()
+    operation.add_done_callback(callback)
+    return operation
 # [END def_transcribe_file]
 
+def transcode_file(filename):
+    """Transcode to mono 48 kHz FLAC via ffmpeg (raises on failure); return new path."""
+    output = '{}-transcode.flac'.format(os.path.splitext(filename)[0])
+    subprocess.run(['ffmpeg', '-i', filename, '-ac', '1', '-ar', '48000', '-acodec', 'flac', output], check=True)
+    print("transcoded: ", output)
+    return output
+
 
-# [START def_transcribe_gcs]
 def transcribe_gcs(gcs_uri, output):
     """Asynchronously transcribes the audio file specified by the gcs_uri."""
     client = speech.SpeechClient()
@@ -73,40 +89,45 @@ def transcribe_gcs(gcs_uri, output):
     metadata.interaction_type = speech.enums.RecognitionMetadata.InteractionType.DISCUSSION
     metadata.microphone_distance = speech.enums.RecognitionMetadata.MicrophoneDistance.NEARFIELD
     metadata.recording_device_type = speech.enums.RecognitionMetadata.RecordingDeviceType.PC
+
     config = speech.types.RecognitionConfig(
         encoding=speech.enums.RecognitionConfig.AudioEncoding.FLAC,
-        sample_rate_hertz=16000,
+        sample_rate_hertz=48000,
         language_code='en-US',
         metadata=metadata,
         enable_automatic_punctuation=True,
-        enable_word_time_offsets=True)
+        enable_word_time_offsets=True,
+    )
 
+    print('Transcribing... {}'.format(gcs_uri))
     operation = client.long_running_recognize(config, audio)
+    operation.add_done_callback(lambda operation_future: save_results(operation_future.result().results, output))
+    return operation
 
-    print('Transcribing...')
-    response = operation.result(timeout=90)
-
+def save_results(results, output):
     # Each result is for a consecutive portion of the audio. Iterate through
     # them to get the transcripts for the entire audio file.
-    timestamp = 0.0
     with open(output, 'w', newline='') as csvfile:
         fieldnames = ['timestamp', 'confidence', 'transcript']
         csvwriter = csv.DictWriter(csvfile, fieldnames=fieldnames)
         csvwriter.writeheader()
-        for result in response.results:
+        for result in results:
             alternative = result.alternatives[0]
             if len(alternative.words) > 0:
                 timestamp = alternative.words[0].start_time
                 timestamp = timestamp.seconds + 1e-9*timestamp.nanos
-                timestamp_mins = int(timestamp // 60)
-                timestamp_secs = timestamp - timestamp_mins * 60
-                csvwriter.writerow({
-                    'timestamp': '{}:{}'.format(timestamp_mins, timestamp_secs),
-                    'confidence': alternative.confidence,
-                    'transcript': alternative.transcript,
-                })
-                print(u'{}:{} | {} | {}'.format(timestamp_mins, timestamp_secs , alternative.confidence, alternative.transcript))
-# [END def_transcribe]
+                timestamp_hrs = int(timestamp // 3600)
+                timestamp_mins = int((timestamp - timestamp_hrs*3600) // 60)
+                timestamp_secs = int(timestamp - timestamp_mins * 60 - timestamp_hrs * 3600)
+                timestamp_str = '{:0>2d}:{:0>2d}:{:0>2d}'.format(timestamp_hrs, timestamp_mins, timestamp_secs)
+            else:
+                timestamp_str = ''
+            csvwriter.writerow({
+                'timestamp': timestamp_str,
+                'confidence': '{:.2f}'.format(alternative.confidence),
+                'transcript': alternative.transcript,
+            })
+            print(u'{} | {:.2f} | {}'.format(timestamp_str, alternative.confidence, alternative.transcript))
 
 
 if __name__ == '__main__':
@@ -120,7 +141,9 @@ def transcribe_gcs(gcs_uri, output):
     args = parser.parse_args()
     if args.out is None:
         args.out = os.path.splitext(args.audio_file)[0] + ".csv"
+    operation = None
     if args.audio_file.startswith('gs://'):
-        transcribe_gcs(args.audio_file, args.out)
+        operation = transcribe_gcs(args.audio_file, args.out)
     else:
-        transcribe_file(args.audio_file, args.out)
+        operation = transcribe_file(args.audio_file, args.out)
+    operation.result()