From 480cddbf2c0775b5d8022aff96d0796bdc1039ae Mon Sep 17 00:00:00 2001 From: GiviMAD Date: Sat, 12 Mar 2022 23:06:51 +0100 Subject: [PATCH] STT service improvements (#12453) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [googlestt|voskstt] change default maxSilenceSeconds to 3 * [watsonstt] add singleUtterance mode, rename inactivityTimeout to maxSilenceSeconds and minor improvements * [watsonstt] trim transcription Signed-off-by: Miguel Álvarez Díez --- .../internal/GoogleSTTConfiguration.java | 2 +- .../main/resources/OH-INF/config/config.xml | 2 +- .../internal/VoskSTTConfiguration.java | 2 +- .../main/resources/OH-INF/config/config.xml | 2 +- bundles/org.openhab.voice.watsonstt/README.md | 6 +- bundles/org.openhab.voice.watsonstt/pom.xml | 6 +- .../internal/WatsonSTTConfiguration.java | 8 ++- .../watsonstt/internal/WatsonSTTService.java | 55 ++++++++----------- .../main/resources/OH-INF/config/config.xml | 9 ++- .../OH-INF/i18n/watsonstt.properties | 6 +- 10 files changed, 51 insertions(+), 47 deletions(-) diff --git a/bundles/org.openhab.voice.googlestt/src/main/java/org/openhab/voice/googlestt/internal/GoogleSTTConfiguration.java b/bundles/org.openhab.voice.googlestt/src/main/java/org/openhab/voice/googlestt/internal/GoogleSTTConfiguration.java index a844bdb7b..4811dc1c7 100644 --- a/bundles/org.openhab.voice.googlestt/src/main/java/org/openhab/voice/googlestt/internal/GoogleSTTConfiguration.java +++ b/bundles/org.openhab.voice.googlestt/src/main/java/org/openhab/voice/googlestt/internal/GoogleSTTConfiguration.java @@ -49,7 +49,7 @@ public class GoogleSTTConfiguration { * Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop * listening. */ - public int maxSilenceSeconds = 5; + public int maxSilenceSeconds = 3; /** * Single phrase mode. 
*/ diff --git a/bundles/org.openhab.voice.googlestt/src/main/resources/OH-INF/config/config.xml b/bundles/org.openhab.voice.googlestt/src/main/resources/OH-INF/config/config.xml index 171147383..58a0d9b0b 100644 --- a/bundles/org.openhab.voice.googlestt/src/main/resources/OH-INF/config/config.xml +++ b/bundles/org.openhab.voice.googlestt/src/main/resources/OH-INF/config/config.xml @@ -46,7 +46,7 @@ Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop listening. - 5 + 3 diff --git a/bundles/org.openhab.voice.voskstt/src/main/java/org/openhab/voice/voskstt/internal/VoskSTTConfiguration.java b/bundles/org.openhab.voice.voskstt/src/main/java/org/openhab/voice/voskstt/internal/VoskSTTConfiguration.java index 1f09cf98d..b4ebc9a73 100644 --- a/bundles/org.openhab.voice.voskstt/src/main/java/org/openhab/voice/voskstt/internal/VoskSTTConfiguration.java +++ b/bundles/org.openhab.voice.voskstt/src/main/java/org/openhab/voice/voskstt/internal/VoskSTTConfiguration.java @@ -33,7 +33,7 @@ public class VoskSTTConfiguration { * Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop * listening. */ - public int maxSilenceSeconds = 5; + public int maxSilenceSeconds = 3; /** * Message to be told when no results. */ diff --git a/bundles/org.openhab.voice.voskstt/src/main/resources/OH-INF/config/config.xml b/bundles/org.openhab.voice.voskstt/src/main/resources/OH-INF/config/config.xml index 627b4d697..1a6e37c1c 100644 --- a/bundles/org.openhab.voice.voskstt/src/main/resources/OH-INF/config/config.xml +++ b/bundles/org.openhab.voice.voskstt/src/main/resources/OH-INF/config/config.xml @@ -27,7 +27,7 @@ Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop listening. 
- 5 + 3 diff --git a/bundles/org.openhab.voice.watsonstt/README.md b/bundles/org.openhab.voice.watsonstt/README.md index adcfcf970..554829e57 100644 --- a/bundles/org.openhab.voice.watsonstt/README.md +++ b/bundles/org.openhab.voice.watsonstt/README.md @@ -26,7 +26,8 @@ Use your favorite configuration UI to edit **Settings / Other Services - IBM Wat * **Background Audio Suppression** - Use the parameter to suppress side conversations or background noise. * **Speech Detector Sensitivity** - Use the parameter to suppress word insertions from music, coughing, and other non-speech events. -* **Inactivity Timeout** - The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed. +* **Single Utterance Mode** - When enabled recognition stops listening after a single utterance. +* **Max Silence Seconds** - The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed. * **Opt Out Logging** - By default, all IBM Watson™ services log requests and their results. Logging is done only to improve the services for future users. The logged data is not shared or made public. * **No Results Message** - Message to be told when no results. * **Smart Formatting** - If true, the service converts dates, times, series of digits and numbers, phone numbers, currency values, and internet addresses into more readable. 
(Not available for all locales) @@ -43,7 +44,8 @@ org.openhab.voice.watsonstt:apiKey=****** org.openhab.voice.watsonstt:instanceUrl=https://api.***.speech-to-text.watson.cloud.ibm.com/instances/***** org.openhab.voice.watsonstt:backgroundAudioSuppression=0.5 org.openhab.voice.watsonstt:speechDetectorSensitivity=0.5 -org.openhab.voice.watsonstt:inactivityTimeout=2 +org.openhab.voice.watsonstt:singleUtteranceMode=true +org.openhab.voice.watsonstt:maxSilenceSeconds=2 org.openhab.voice.watsonstt:optOutLogging=false org.openhab.voice.watsonstt:smartFormatting=false org.openhab.voice.watsonstt:redaction=false diff --git a/bundles/org.openhab.voice.watsonstt/pom.xml b/bundles/org.openhab.voice.watsonstt/pom.xml index 30ad6db8b..1e0c4d153 100644 --- a/bundles/org.openhab.voice.watsonstt/pom.xml +++ b/bundles/org.openhab.voice.watsonstt/pom.xml @@ -27,7 +27,7 @@ com.ibm.cloud sdk-core - 9.15.0 + 9.15.4 compile @@ -39,13 +39,13 @@ com.squareup.okhttp3 okhttp - 4.9.1 + 4.9.3 compile com.squareup.okhttp3 okhttp-urlconnection - 4.9.1 + 4.9.3 compile diff --git a/bundles/org.openhab.voice.watsonstt/src/main/java/org/openhab/voice/watsonstt/internal/WatsonSTTConfiguration.java b/bundles/org.openhab.voice.watsonstt/src/main/java/org/openhab/voice/watsonstt/internal/WatsonSTTConfiguration.java index b7785541e..5358d92d7 100644 --- a/bundles/org.openhab.voice.watsonstt/src/main/java/org/openhab/voice/watsonstt/internal/WatsonSTTConfiguration.java +++ b/bundles/org.openhab.voice.watsonstt/src/main/java/org/openhab/voice/watsonstt/internal/WatsonSTTConfiguration.java @@ -48,9 +48,13 @@ public class WatsonSTTConfiguration { */ public boolean redaction = false; /** - * The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed. + * Single phrase mode. */ - public int inactivityTimeout = 3; + public boolean singleUtteranceMode = true; + /** + * max seconds without getting new transcriptions to stop listening. 
+ */ + public int maxSilenceSeconds = 3; /** * Message to be told when no results */ diff --git a/bundles/org.openhab.voice.watsonstt/src/main/java/org/openhab/voice/watsonstt/internal/WatsonSTTService.java b/bundles/org.openhab.voice.watsonstt/src/main/java/org/openhab/voice/watsonstt/internal/WatsonSTTService.java index ebd5c0759..311ebb769 100644 --- a/bundles/org.openhab.voice.watsonstt/src/main/java/org/openhab/voice/watsonstt/internal/WatsonSTTService.java +++ b/bundles/org.openhab.voice.watsonstt/src/main/java/org/openhab/voice/watsonstt/internal/WatsonSTTService.java @@ -23,8 +23,6 @@ import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicReference; import java.util.stream.Collectors; -import javax.net.ssl.SSLPeerUnverifiedException; - import org.eclipse.jdt.annotation.NonNullByDefault; import org.eclipse.jdt.annotation.Nullable; import org.openhab.core.audio.AudioFormat; @@ -47,6 +45,7 @@ import org.osgi.service.component.annotations.Modified; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.gson.JsonObject; import com.ibm.cloud.sdk.core.http.HttpMediaType; import com.ibm.cloud.sdk.core.security.IamAuthenticator; import com.ibm.watson.speech_to_text.v1.SpeechToText; @@ -130,31 +129,13 @@ public class WatsonSTTService implements STTService { .contentType(contentType).redaction(config.redaction).smartFormatting(config.smartFormatting) .model(locale.toLanguageTag() + "_BroadbandModel").interimResults(true) .backgroundAudioSuppression(config.backgroundAudioSuppression) - .speechDetectorSensitivity(config.speechDetectorSensitivity).inactivityTimeout(config.inactivityTimeout) + .speechDetectorSensitivity(config.speechDetectorSensitivity).inactivityTimeout(config.maxSilenceSeconds) .build(); final AtomicReference<@Nullable WebSocket> socketRef = new AtomicReference<>(); final AtomicBoolean aborted = new AtomicBoolean(false); executor.submit(() -> { - int retries = 2; - while (retries > 0) { - try { - 
socketRef.set(speechToText.recognizeUsingWebSocket(wsOptions, - new TranscriptionListener(sttListener, config, aborted))); - break; - } catch (RuntimeException e) { - var cause = e.getCause(); - if (cause instanceof SSLPeerUnverifiedException) { - logger.debug("Retrying on error: {}", cause.getMessage()); - retries--; - } else { - var errorMessage = e.getMessage(); - logger.warn("Aborting on error: {}", errorMessage); - sttListener.sttEventReceived( - new SpeechRecognitionErrorEvent(errorMessage != null ? errorMessage : "Unknown error")); - break; - } - } - } + socketRef.set(speechToText.recognizeUsingWebSocket(wsOptions, + new TranscriptionListener(socketRef, sttListener, config, aborted))); }); return new STTServiceHandle() { @Override @@ -162,12 +143,7 @@ public class WatsonSTTService implements STTService { if (!aborted.getAndSet(true)) { var socket = socketRef.get(); if (socket != null) { - socket.close(1000, null); - socket.cancel(); - try { - Thread.sleep(100); - } catch (InterruptedException ignored) { - } + sendStopMessage(socket); } } } @@ -224,17 +200,26 @@ public class WatsonSTTService implements STTService { return null; } + private static void sendStopMessage(WebSocket ws) { + JsonObject stopMessage = new JsonObject(); + stopMessage.addProperty("action", "stop"); + ws.send(stopMessage.toString()); + } + private static class TranscriptionListener implements RecognizeCallback { private final Logger logger = LoggerFactory.getLogger(TranscriptionListener.class); private final StringBuilder transcriptBuilder = new StringBuilder(); private final STTListener sttListener; private final WatsonSTTConfiguration config; private final AtomicBoolean aborted; + private final AtomicReference<@Nullable WebSocket> socketRef; private float confidenceSum = 0f; private int responseCount = 0; private boolean disconnected = false; - public TranscriptionListener(STTListener sttListener, WatsonSTTConfiguration config, AtomicBoolean aborted) { + public 
TranscriptionListener(AtomicReference<@Nullable WebSocket> socketRef, STTListener sttListener, + WatsonSTTConfiguration config, AtomicBoolean aborted) { + this.socketRef = socketRef; this.sttListener = sttListener; this.config = config; this.aborted = aborted; @@ -256,6 +241,12 @@ public class WatsonSTTService implements STTService { transcriptBuilder.append(alternative.getTranscript()); confidenceSum += confidence != null ? confidence.floatValue() : 0f; responseCount++; + if (config.singleUtteranceMode) { + var socket = socketRef.get(); + if (socket != null) { + sendStopMessage(socket); + } + } }); } @@ -272,7 +263,7 @@ public class WatsonSTTService implements STTService { return; } logger.warn("TranscriptionError: {}", errorMessage); - if (!aborted.get()) { + if (!aborted.getAndSet(true)) { sttListener.sttEventReceived( new SpeechRecognitionErrorEvent(errorMessage != null ? errorMessage : "Unknown error")); } @@ -285,7 +276,7 @@ public class WatsonSTTService implements STTService { if (!aborted.getAndSet(true)) { sttListener.sttEventReceived(new RecognitionStopEvent()); float averageConfidence = confidenceSum / (float) responseCount; - String transcript = transcriptBuilder.toString(); + String transcript = transcriptBuilder.toString().trim(); if (!transcript.isBlank()) { sttListener.sttEventReceived(new SpeechRecognitionEvent(transcript, averageConfidence)); } else { diff --git a/bundles/org.openhab.voice.watsonstt/src/main/resources/OH-INF/config/config.xml b/bundles/org.openhab.voice.watsonstt/src/main/resources/OH-INF/config/config.xml index 4dbd2a281..3be580499 100644 --- a/bundles/org.openhab.voice.watsonstt/src/main/resources/OH-INF/config/config.xml +++ b/bundles/org.openhab.voice.watsonstt/src/main/resources/OH-INF/config/config.xml @@ -32,8 +32,8 @@ Use the parameter to suppress word insertions from music, coughing, and other non-speech events. 
0.5 - - + + The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed. 3 @@ -43,6 +43,11 @@ Message to be told when no transcription is done. No results + + + When enabled recognition stops listening after a single utterance. + true + By default, all IBM Watson™ services log requests and their results. Logging is done only to improve the diff --git a/bundles/org.openhab.voice.watsonstt/src/main/resources/OH-INF/i18n/watsonstt.properties b/bundles/org.openhab.voice.watsonstt/src/main/resources/OH-INF/i18n/watsonstt.properties index eebbd4792..29d5c4056 100644 --- a/bundles/org.openhab.voice.watsonstt/src/main/resources/OH-INF/i18n/watsonstt.properties +++ b/bundles/org.openhab.voice.watsonstt/src/main/resources/OH-INF/i18n/watsonstt.properties @@ -6,16 +6,18 @@ voice.config.watsonstt.group.authentication.label = Authentication voice.config.watsonstt.group.authentication.description = Information for connection to your Watson Speech-to-Text instance. voice.config.watsonstt.group.stt.label = STT Configuration voice.config.watsonstt.group.stt.description = Parameters for Watson Speech-to-Text API. -voice.config.watsonstt.inactivityTimeout.label = Inactivity Timeout -voice.config.watsonstt.inactivityTimeout.description = The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed. voice.config.watsonstt.instanceUrl.label = Instance Url voice.config.watsonstt.instanceUrl.description = Url for Speech-to-Text instance created on IBM Cloud. +voice.config.watsonstt.maxSilenceSeconds.label = Max Silence Seconds +voice.config.watsonstt.maxSilenceSeconds.description = The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed. voice.config.watsonstt.noResultsMessage.label = No Results Message voice.config.watsonstt.noResultsMessage.description = Message to be told when no transcription is done. 
voice.config.watsonstt.optOutLogging.label = Opt Out Logging voice.config.watsonstt.optOutLogging.description = By default, all IBM Watson™ services log requests and their results. Logging is done only to improve the services for future users. The logged data is not shared or made public. voice.config.watsonstt.redaction.label = Redaction voice.config.watsonstt.redaction.description = If true, the service redacts, or masks, numeric data from final transcripts. (Not available for all locales) +voice.config.watsonstt.singleUtteranceMode.label = Single Utterance Mode +voice.config.watsonstt.singleUtteranceMode.description = When enabled recognition stops listening after a single utterance. voice.config.watsonstt.smartFormatting.label = Smart Formatting voice.config.watsonstt.smartFormatting.description = If true, the service converts dates, times, series of digits and numbers, phone numbers, currency values, and internet addresses into more readable. (Not available for all locales) voice.config.watsonstt.speechDetectorSensitivity.label = Speech Detector Sensitivity