[watsonstt] use next gen model (#12971)
Signed-off-by: Miguel Álvarez <miguelwork92@gmail.com>
This commit is contained in:
parent
a4f5308f91
commit
cf94687ad6
|
@ -24,6 +24,7 @@ Use your favorite configuration UI to edit **Settings / Other Services - IBM Wat
|
||||||
|
|
||||||
Use your favorite configuration UI to edit **Settings / Other Services - IBM Watson Speech-to-Text**:
|
Use your favorite configuration UI to edit **Settings / Other Services - IBM Watson Speech-to-Text**:
|
||||||
|
|
||||||
|
* **Prefer Multimedia Model** - Prefer multimedia to telephony [models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng). Multimedia models are intended for audio that has a minimum sampling rate of 16 kHz, while telephony models are intended for audio that has a minimum sampling rate of 8 kHz.
|
||||||
* **Background Audio Suppression** - Use the parameter to suppress side conversations or background noise.
|
* **Background Audio Suppression** - Use the parameter to suppress side conversations or background noise.
|
||||||
* **Speech Detector Sensitivity** - Use the parameter to suppress word insertions from music, coughing, and other non-speech events.
|
* **Speech Detector Sensitivity** - Use the parameter to suppress word insertions from music, coughing, and other non-speech events.
|
||||||
* **Single Utterance Mode** - When enabled, recognition stops listening after a single utterance.
|
* **Single Utterance Mode** - When enabled, recognition stops listening after a single utterance.
|
||||||
|
|
|
@ -20,20 +20,20 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.ibm.watson</groupId>
|
<groupId>com.ibm.watson</groupId>
|
||||||
<artifactId>speech-to-text</artifactId>
|
<artifactId>speech-to-text</artifactId>
|
||||||
<version>9.3.1</version>
|
<version>10.0.1</version>
|
||||||
<scope>compile</scope>
|
<scope>compile</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
<!-- sdk deps -->
|
<!-- sdk deps -->
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.ibm.cloud</groupId>
|
<groupId>com.ibm.cloud</groupId>
|
||||||
<artifactId>sdk-core</artifactId>
|
<artifactId>sdk-core</artifactId>
|
||||||
<version>9.15.4</version>
|
<version>9.15.5</version>
|
||||||
<scope>compile</scope>
|
<scope>compile</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.ibm.watson</groupId>
|
<groupId>com.ibm.watson</groupId>
|
||||||
<artifactId>common</artifactId>
|
<artifactId>common</artifactId>
|
||||||
<version>9.3.1</version>
|
<version>10.0.1</version>
|
||||||
<scope>compile</scope>
|
<scope>compile</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
|
@ -51,7 +51,7 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.jetbrains.kotlin</groupId>
|
<groupId>org.jetbrains.kotlin</groupId>
|
||||||
<artifactId>kotlin-stdlib</artifactId>
|
<artifactId>kotlin-stdlib</artifactId>
|
||||||
<version>1.4.10</version>
|
<version>1.4.32</version>
|
||||||
<scope>compile</scope>
|
<scope>compile</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
|
|
|
@ -30,6 +30,11 @@ public class WatsonSTTConfiguration {
|
||||||
* Url for Speech-to-Text instance created on IBM Cloud.
|
* Url for Speech-to-Text instance created on IBM Cloud.
|
||||||
*/
|
*/
|
||||||
public String instanceUrl = "";
|
public String instanceUrl = "";
|
||||||
|
/**
|
||||||
|
* Prefer multimedia to telephony models. Multimedia models are intended for audio that has a minimum sampling rate
|
||||||
|
* of 16 kHz, while telephony models are intended for audio that has a minimum sampling rate of 8 kHz.
|
||||||
|
*/
|
||||||
|
public boolean preferMultimediaModel = true;
|
||||||
/**
|
/**
|
||||||
* Use the parameter to suppress side conversations or background noise.
|
* Use the parameter to suppress side conversations or background noise.
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -22,6 +22,7 @@ import java.util.concurrent.ScheduledExecutorService;
|
||||||
import java.util.concurrent.atomic.AtomicBoolean;
|
import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
import java.util.concurrent.atomic.AtomicReference;
|
import java.util.concurrent.atomic.AtomicReference;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.eclipse.jdt.annotation.NonNullByDefault;
|
import org.eclipse.jdt.annotation.NonNullByDefault;
|
||||||
import org.eclipse.jdt.annotation.Nullable;
|
import org.eclipse.jdt.annotation.Nullable;
|
||||||
|
@ -69,13 +70,18 @@ import okhttp3.WebSocket;
|
||||||
public class WatsonSTTService implements STTService {
|
public class WatsonSTTService implements STTService {
|
||||||
private final Logger logger = LoggerFactory.getLogger(WatsonSTTService.class);
|
private final Logger logger = LoggerFactory.getLogger(WatsonSTTService.class);
|
||||||
private final ScheduledExecutorService executor = ThreadPoolManager.getScheduledPool("OH-voice-watsonstt");
|
private final ScheduledExecutorService executor = ThreadPoolManager.getScheduledPool("OH-voice-watsonstt");
|
||||||
private final List<String> models = List.of("ar-AR_BroadbandModel", "de-DE_BroadbandModel", "en-AU_BroadbandModel",
|
private final List<String> telephonyModels = List.of("ar-MS_Telephony", "zh-CN_Telephony", "nl-BE_Telephony",
|
||||||
"en-GB_BroadbandModel", "en-US_BroadbandModel", "es-AR_BroadbandModel", "es-CL_BroadbandModel",
|
"nl-NL_Telephony", "en-AU_Telephony", "en-IN_Telephony", "en-GB_Telephony", "en-US_Telephony",
|
||||||
"es-CO_BroadbandModel", "es-ES_BroadbandModel", "es-MX_BroadbandModel", "es-PE_BroadbandModel",
|
"fr-CA_Telephony", "fr-FR_Telephony", "hi-IN_Telephony", "pt-BR_Telephony", "es-ES_Telephony");
|
||||||
"fr-CA_BroadbandModel", "fr-FR_BroadbandModel", "it-IT_BroadbandModel", "ja-JP_BroadbandModel",
|
private final List<String> multimediaModels = List.of("en-AU_Multimedia", "en-GB_Multimedia", "en-US_Multimedia",
|
||||||
"ko-KR_BroadbandModel", "nl-NL_BroadbandModel", "pt-BR_BroadbandModel", "zh-CN_BroadbandModel");
|
"fr-FR_Multimedia", "de-DE_Multimedia", "it-IT_Multimedia", "ja-JP_Multimedia", "ko-KR_Multimedia",
|
||||||
private final Set<Locale> supportedLocales = models.stream().map(name -> name.split("_")[0])
|
"pt-BR_Multimedia", "es-ES_Multimedia");
|
||||||
.map(Locale::forLanguageTag).collect(Collectors.toSet());
|
// models 'es-LA_Telephony' and 'en-WW_Medical_Telephony' are used as fallbacks for 'es' and 'en' respectively
|
||||||
|
private final List<Locale> fallbackLocales = List.of(Locale.forLanguageTag("es"), Locale.ENGLISH);
|
||||||
|
private final Set<Locale> supportedLocales = Stream
|
||||||
|
.concat(Stream.concat(telephonyModels.stream(), multimediaModels.stream()).map(name -> name.split("_")[0])
|
||||||
|
.distinct().map(Locale::forLanguageTag), fallbackLocales.stream())
|
||||||
|
.collect(Collectors.toSet());
|
||||||
private WatsonSTTConfiguration config = new WatsonSTTConfiguration();
|
private WatsonSTTConfiguration config = new WatsonSTTConfiguration();
|
||||||
private @Nullable SpeechToText speechToText = null;
|
private @Nullable SpeechToText speechToText = null;
|
||||||
|
|
||||||
|
@ -134,7 +140,7 @@ public class WatsonSTTService implements STTService {
|
||||||
logger.debug("Content-Type: {}", contentType);
|
logger.debug("Content-Type: {}", contentType);
|
||||||
RecognizeWithWebsocketsOptions wsOptions = new RecognizeWithWebsocketsOptions.Builder().audio(audioStream)
|
RecognizeWithWebsocketsOptions wsOptions = new RecognizeWithWebsocketsOptions.Builder().audio(audioStream)
|
||||||
.contentType(contentType).redaction(config.redaction).smartFormatting(config.smartFormatting)
|
.contentType(contentType).redaction(config.redaction).smartFormatting(config.smartFormatting)
|
||||||
.model(locale.toLanguageTag() + "_BroadbandModel").interimResults(true)
|
.model(getModel(locale)).interimResults(true)
|
||||||
.backgroundAudioSuppression(config.backgroundAudioSuppression)
|
.backgroundAudioSuppression(config.backgroundAudioSuppression)
|
||||||
.speechDetectorSensitivity(config.speechDetectorSensitivity).inactivityTimeout(config.maxSilenceSeconds)
|
.speechDetectorSensitivity(config.speechDetectorSensitivity).inactivityTimeout(config.maxSilenceSeconds)
|
||||||
.build();
|
.build();
|
||||||
|
@ -157,6 +163,33 @@ public class WatsonSTTService implements STTService {
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private String getModel(Locale locale) throws STTException {
|
||||||
|
String languageTag = locale.toLanguageTag();
|
||||||
|
Stream<String> allModels;
|
||||||
|
if (config.preferMultimediaModel) {
|
||||||
|
allModels = Stream.concat(multimediaModels.stream(), telephonyModels.stream());
|
||||||
|
} else {
|
||||||
|
allModels = Stream.concat(telephonyModels.stream(), multimediaModels.stream());
|
||||||
|
}
|
||||||
|
var modelOption = allModels.filter(model -> model.startsWith(languageTag)).findFirst();
|
||||||
|
if (modelOption.isEmpty()) {
|
||||||
|
if ("es".equals(locale.getLanguage())) {
|
||||||
|
// fallback for latin american spanish languages
|
||||||
|
var model = "es-LA_Telephony";
|
||||||
|
logger.debug("Falling back to model: {}", model);
|
||||||
|
}
|
||||||
|
if ("en".equals(locale.getLanguage())) {
|
||||||
|
// fallback english dialects
|
||||||
|
var model = "en-WW_Medical_Telephony";
|
||||||
|
logger.debug("Falling back to model: {}", model);
|
||||||
|
}
|
||||||
|
throw new STTException("No compatible model for language " + languageTag);
|
||||||
|
}
|
||||||
|
var model = modelOption.get();
|
||||||
|
logger.debug("Using model: {}", model);
|
||||||
|
return model;
|
||||||
|
}
|
||||||
|
|
||||||
private @Nullable String getContentType(AudioStream audioStream) throws STTException {
|
private @Nullable String getContentType(AudioStream audioStream) throws STTException {
|
||||||
AudioFormat format = audioStream.getFormat();
|
AudioFormat format = audioStream.getFormat();
|
||||||
String container = format.getContainer();
|
String container = format.getContainer();
|
||||||
|
|
|
@ -22,6 +22,12 @@
|
||||||
<label>Instance Url</label>
|
<label>Instance Url</label>
|
||||||
<description>Url for Speech-to-Text instance created on IBM Cloud.</description>
|
<description>Url for Speech-to-Text instance created on IBM Cloud.</description>
|
||||||
</parameter>
|
</parameter>
|
||||||
|
<parameter name="preferMultimediaModel" type="boolean" groupName="stt">
|
||||||
|
<label>Prefer Multimedia Model</label>
|
||||||
|
<description>Prefer multimedia to telephony models. Multimedia models are intended for audio that has a minimum
|
||||||
|
sampling rate of 16 kHz, while telephony models are intended for audio that has a minimum sampling rate of 8 kHz.</description>
|
||||||
|
<default>true</default>
|
||||||
|
</parameter>
|
||||||
<parameter name="backgroundAudioSuppression" type="decimal" min="0" max="1" step="0.1" groupName="stt">
|
<parameter name="backgroundAudioSuppression" type="decimal" min="0" max="1" step="0.1" groupName="stt">
|
||||||
<label>Background Audio Suppression</label>
|
<label>Background Audio Suppression</label>
|
||||||
<description>Use the parameter to suppress side conversations or background noise.</description>
|
<description>Use the parameter to suppress side conversations or background noise.</description>
|
||||||
|
|
|
@ -14,6 +14,8 @@ voice.config.watsonstt.noResultsMessage.label = No Results Message
|
||||||
voice.config.watsonstt.noResultsMessage.description = Message to be told when no transcription is done.
|
voice.config.watsonstt.noResultsMessage.description = Message to be told when no transcription is done.
|
||||||
voice.config.watsonstt.optOutLogging.label = Opt Out Logging
|
voice.config.watsonstt.optOutLogging.label = Opt Out Logging
|
||||||
voice.config.watsonstt.optOutLogging.description = By default, all IBM Watson™ services log requests and their results. Logging is done only to improve the services for future users. The logged data is not shared or made public.
|
voice.config.watsonstt.optOutLogging.description = By default, all IBM Watson™ services log requests and their results. Logging is done only to improve the services for future users. The logged data is not shared or made public.
|
||||||
|
voice.config.watsonstt.preferMultimediaModel.label = Prefer Multimedia Model
|
||||||
|
voice.config.watsonstt.preferMultimediaModel.description = Prefer multimedia to telephony models. Multimedia models are intended for audio that has a minimum sampling rate of 16 kHz, while telephony models are intended for audio that has a minimum sampling rate of 8 kHz.
|
||||||
voice.config.watsonstt.redaction.label = Redaction
|
voice.config.watsonstt.redaction.label = Redaction
|
||||||
voice.config.watsonstt.redaction.description = If true, the service redacts, or masks, numeric data from final transcripts. (Not available for all locales)
|
voice.config.watsonstt.redaction.description = If true, the service redacts, or masks, numeric data from final transcripts. (Not available for all locales)
|
||||||
voice.config.watsonstt.singleUtteranceMode.label = Single Utterance Mode
|
voice.config.watsonstt.singleUtteranceMode.label = Single Utterance Mode
|
||||||
|
|
Loading…
Reference in New Issue