From cf94687ad67f0bd07a06d762cd1268c76c179eee Mon Sep 17 00:00:00 2001 From: GiviMAD Date: Wed, 22 Jun 2022 08:54:13 +0200 Subject: [PATCH] [watsonstt] use next gen model (#12971) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Miguel Álvarez --- bundles/org.openhab.voice.watsonstt/README.md | 1 + bundles/org.openhab.voice.watsonstt/pom.xml | 8 +-- .../internal/WatsonSTTConfiguration.java | 5 ++ .../watsonstt/internal/WatsonSTTService.java | 49 ++++++++++++++++--- .../main/resources/OH-INF/config/config.xml | 6 +++ .../OH-INF/i18n/watsonstt.properties | 2 + 6 files changed, 59 insertions(+), 12 deletions(-) diff --git a/bundles/org.openhab.voice.watsonstt/README.md b/bundles/org.openhab.voice.watsonstt/README.md index 554829e57..46ba04c50 100644 --- a/bundles/org.openhab.voice.watsonstt/README.md +++ b/bundles/org.openhab.voice.watsonstt/README.md @@ -24,6 +24,7 @@ Use your favorite configuration UI to edit **Settings / Other Services - IBM Wat Use your favorite configuration UI to edit **Settings / Other Services - IBM Watson Speech-to-Text**: +* **Prefer Multimedia Model** - Prefer multimedia to telephony [models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng). Multimedia models are intended for audio that has a minimum sampling rate of 16 kHz, while telephony models are intended for audio that has a minimum sampling rate of 8 kHz. * **Background Audio Suppression** - Use the parameter to suppress side conversations or background noise. * **Speech Detector Sensitivity** - Use the parameter to suppress word insertions from music, coughing, and other non-speech events. * **Single Utterance Mode** - When enabled recognition stops listening after a single utterance. 
diff --git a/bundles/org.openhab.voice.watsonstt/pom.xml b/bundles/org.openhab.voice.watsonstt/pom.xml index 1e0c4d153..45cbf5d02 100644 --- a/bundles/org.openhab.voice.watsonstt/pom.xml +++ b/bundles/org.openhab.voice.watsonstt/pom.xml @@ -20,20 +20,20 @@ com.ibm.watson speech-to-text - 9.3.1 + 10.0.1 compile com.ibm.cloud sdk-core - 9.15.4 + 9.15.5 compile com.ibm.watson common - 9.3.1 + 10.0.1 compile @@ -51,7 +51,7 @@ org.jetbrains.kotlin kotlin-stdlib - 1.4.10 + 1.4.32 compile diff --git a/bundles/org.openhab.voice.watsonstt/src/main/java/org/openhab/voice/watsonstt/internal/WatsonSTTConfiguration.java b/bundles/org.openhab.voice.watsonstt/src/main/java/org/openhab/voice/watsonstt/internal/WatsonSTTConfiguration.java index 5358d92d7..0dd2b4f93 100644 --- a/bundles/org.openhab.voice.watsonstt/src/main/java/org/openhab/voice/watsonstt/internal/WatsonSTTConfiguration.java +++ b/bundles/org.openhab.voice.watsonstt/src/main/java/org/openhab/voice/watsonstt/internal/WatsonSTTConfiguration.java @@ -30,6 +30,11 @@ public class WatsonSTTConfiguration { * Url for Speech-to-Text instance created on IBM Cloud. */ public String instanceUrl = ""; + /** + * Prefer multimedia to telephony models. Multimedia models are intended for audio that has a minimum sampling rate + * of 16 kHz, while telephony models are intended for audio that has a minimum sampling rate of 8 kHz. + */ + public boolean preferMultimediaModel = true; /** * Use the parameter to suppress side conversations or background noise. 
*/ diff --git a/bundles/org.openhab.voice.watsonstt/src/main/java/org/openhab/voice/watsonstt/internal/WatsonSTTService.java b/bundles/org.openhab.voice.watsonstt/src/main/java/org/openhab/voice/watsonstt/internal/WatsonSTTService.java index 9454a778a..5a90252df 100644 --- a/bundles/org.openhab.voice.watsonstt/src/main/java/org/openhab/voice/watsonstt/internal/WatsonSTTService.java +++ b/bundles/org.openhab.voice.watsonstt/src/main/java/org/openhab/voice/watsonstt/internal/WatsonSTTService.java @@ -22,6 +22,7 @@ import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicReference; import java.util.stream.Collectors; +import java.util.stream.Stream; import org.eclipse.jdt.annotation.NonNullByDefault; import org.eclipse.jdt.annotation.Nullable; @@ -69,13 +70,18 @@ import okhttp3.WebSocket; public class WatsonSTTService implements STTService { private final Logger logger = LoggerFactory.getLogger(WatsonSTTService.class); private final ScheduledExecutorService executor = ThreadPoolManager.getScheduledPool("OH-voice-watsonstt"); - private final List<String> models = List.of("ar-AR_BroadbandModel", "de-DE_BroadbandModel", "en-AU_BroadbandModel", - "en-GB_BroadbandModel", "en-US_BroadbandModel", "es-AR_BroadbandModel", "es-CL_BroadbandModel", - "es-CO_BroadbandModel", "es-ES_BroadbandModel", "es-MX_BroadbandModel", "es-PE_BroadbandModel", - "fr-CA_BroadbandModel", "fr-FR_BroadbandModel", "it-IT_BroadbandModel", "ja-JP_BroadbandModel", - "ko-KR_BroadbandModel", "nl-NL_BroadbandModel", "pt-BR_BroadbandModel", "zh-CN_BroadbandModel"); - private final Set<Locale> supportedLocales = models.stream().map(name -> name.split("_")[0]) - .map(Locale::forLanguageTag).collect(Collectors.toSet()); + private final List<String> telephonyModels = List.of("ar-MS_Telephony", "zh-CN_Telephony", "nl-BE_Telephony", + "nl-NL_Telephony", "en-AU_Telephony", "en-IN_Telephony", "en-GB_Telephony", "en-US_Telephony", + "fr-CA_Telephony",
"fr-FR_Telephony", "hi-IN_Telephony", "pt-BR_Telephony", "es-ES_Telephony"); + private final List<String> multimediaModels = List.of("en-AU_Multimedia", "en-GB_Multimedia", "en-US_Multimedia", + "fr-FR_Multimedia", "de-DE_Multimedia", "it-IT_Multimedia", "ja-JP_Multimedia", "ko-KR_Multimedia", + "pt-BR_Multimedia", "es-ES_Multimedia"); + // models 'es-LA_Telephony' and 'en-WW_Medical_Telephony' are used as fallbacks for es and en + private final List<Locale> fallbackLocales = List.of(Locale.forLanguageTag("es"), Locale.ENGLISH); + private final Set<Locale> supportedLocales = Stream + .concat(Stream.concat(telephonyModels.stream(), multimediaModels.stream()).map(name -> name.split("_")[0]) + .distinct().map(Locale::forLanguageTag), fallbackLocales.stream()) + .collect(Collectors.toSet()); private WatsonSTTConfiguration config = new WatsonSTTConfiguration(); private @Nullable SpeechToText speechToText = null; @@ -134,7 +140,7 @@ public class WatsonSTTService implements STTService { logger.debug("Content-Type: {}", contentType); RecognizeWithWebsocketsOptions wsOptions = new RecognizeWithWebsocketsOptions.Builder().audio(audioStream) .contentType(contentType).redaction(config.redaction).smartFormatting(config.smartFormatting) - .model(locale.toLanguageTag() + "_BroadbandModel").interimResults(true) + .model(getModel(locale)).interimResults(true) .backgroundAudioSuppression(config.backgroundAudioSuppression) .speechDetectorSensitivity(config.speechDetectorSensitivity).inactivityTimeout(config.maxSilenceSeconds) .build(); @@ -157,6 +163,43 @@ public class WatsonSTTService implements STTService { }; } + /** + * Selects the model for a locale: the first model whose name starts with its language tag, searching the + * preferred family first, otherwise the fallback model for "es" or "en". + * + * @param locale locale requested for recognition + * @return name of the model to use + * @throws STTException when neither a matching model nor a fallback exists + */ + private String getModel(Locale locale) throws STTException { + String languageTag = locale.toLanguageTag(); + Stream<String> allModels; + if (config.preferMultimediaModel) { + allModels = Stream.concat(multimediaModels.stream(), telephonyModels.stream()); + } else { + allModels = Stream.concat(telephonyModels.stream(), multimediaModels.stream()); + } + var modelOption = allModels.filter(model ->
model.startsWith(languageTag)).findFirst(); + if (modelOption.isEmpty()) { + if ("es".equals(locale.getLanguage())) { + // fallback for latin american spanish languages + var model = "es-LA_Telephony"; + logger.debug("Falling back to model: {}", model); + return model; + } + if ("en".equals(locale.getLanguage())) { + // fallback english dialects + var model = "en-WW_Medical_Telephony"; + logger.debug("Falling back to model: {}", model); + return model; + } + throw new STTException("No compatible model for language " + languageTag); + } + var model = modelOption.get(); + logger.debug("Using model: {}", model); + return model; + } + private @Nullable String getContentType(AudioStream audioStream) throws STTException { AudioFormat format = audioStream.getFormat(); String container = format.getContainer(); diff --git a/bundles/org.openhab.voice.watsonstt/src/main/resources/OH-INF/config/config.xml b/bundles/org.openhab.voice.watsonstt/src/main/resources/OH-INF/config/config.xml index 3be580499..ed54844ae 100644 --- a/bundles/org.openhab.voice.watsonstt/src/main/resources/OH-INF/config/config.xml +++ b/bundles/org.openhab.voice.watsonstt/src/main/resources/OH-INF/config/config.xml @@ -22,6 +22,12 @@ Url for Speech-to-Text instance created on IBM Cloud. + + + Prefer multimedia to telephony models. Multimedia models are intended for audio that has a minimum + sampling rate of 16 kHz, while telephony models are intended for audio that has a minimum sampling rate of 8 kHz. + true + Use the parameter to suppress side conversations or background noise.
diff --git a/bundles/org.openhab.voice.watsonstt/src/main/resources/OH-INF/i18n/watsonstt.properties b/bundles/org.openhab.voice.watsonstt/src/main/resources/OH-INF/i18n/watsonstt.properties index 29d5c4056..6ca306aac 100644 --- a/bundles/org.openhab.voice.watsonstt/src/main/resources/OH-INF/i18n/watsonstt.properties +++ b/bundles/org.openhab.voice.watsonstt/src/main/resources/OH-INF/i18n/watsonstt.properties @@ -14,6 +14,8 @@ voice.config.watsonstt.noResultsMessage.label = No Results Message voice.config.watsonstt.noResultsMessage.description = Message to be told when no transcription is done. voice.config.watsonstt.optOutLogging.label = Opt Out Logging voice.config.watsonstt.optOutLogging.description = By default, all IBM Watson™ services log requests and their results. Logging is done only to improve the services for future users. The logged data is not shared or made public. +voice.config.watsonstt.preferMultimediaModel.label = Prefer Multimedia Model +voice.config.watsonstt.preferMultimediaModel.description = Prefer multimedia to telephony models. Multimedia models are intended for audio that has a minimum sampling rate of 16 kHz, while telephony models are intended for audio that has a minimum sampling rate of 8 kHz. voice.config.watsonstt.redaction.label = Redaction voice.config.watsonstt.redaction.description = If true, the service redacts, or masks, numeric data from final transcripts. (Not available for all locales) voice.config.watsonstt.singleUtteranceMode.label = Single Utterance Mode