diff --git a/CODEOWNERS b/CODEOWNERS index 02ef5fa36..e15f903cd 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -374,6 +374,7 @@ /bundles/org.openhab.transform.scale/ @clinique /bundles/org.openhab.transform.xpath/ @openhab/add-ons-maintainers /bundles/org.openhab.transform.xslt/ @openhab/add-ons-maintainers +/bundles/org.openhab.voice.googlestt/ @GiviMAD /bundles/org.openhab.voice.googletts/ @gbicskei /bundles/org.openhab.voice.mactts/ @kaikreuzer /bundles/org.openhab.voice.marytts/ @kaikreuzer diff --git a/bom/openhab-addons/pom.xml b/bom/openhab-addons/pom.xml index 56ebd6d9b..639dcf9c3 100644 --- a/bom/openhab-addons/pom.xml +++ b/bom/openhab-addons/pom.xml @@ -1861,6 +1861,11 @@ org.openhab.transform.xslt ${project.version} + + org.openhab.addons.bundles + org.openhab.voice.googlestt + ${project.version} + org.openhab.addons.bundles org.openhab.voice.googletts diff --git a/bundles/org.openhab.voice.googlestt/NOTICE b/bundles/org.openhab.voice.googlestt/NOTICE new file mode 100644 index 000000000..38d625e34 --- /dev/null +++ b/bundles/org.openhab.voice.googlestt/NOTICE @@ -0,0 +1,13 @@ +This content is produced and maintained by the openHAB project. + +* Project home: https://www.openhab.org + +== Declared Project Licenses + +This program and the accompanying materials are made available under the terms +of the Eclipse Public License 2.0 which is available at +https://www.eclipse.org/legal/epl-2.0/. + +== Source Code + +https://github.com/openhab/openhab-addons diff --git a/bundles/org.openhab.voice.googlestt/README.md b/bundles/org.openhab.voice.googlestt/README.md new file mode 100644 index 000000000..6868ffe4c --- /dev/null +++ b/bundles/org.openhab.voice.googlestt/README.md @@ -0,0 +1,62 @@ +# Google Cloud Speech-to-Text + +Google Cloud STT Service uses the non-free Google Cloud Speech-to-Text API to transcript audio data to text. +Be aware, that using this service may incur cost on your Google Cloud account. 
+You can find pricing information on the [documentation page](https://cloud.google.com/speech-to-text#section-12). + +## Obtaining Credentials + +Before you can integrate this service with your Google Cloud Speech-to-Text, you must have a Google API Console project: + +* Select or create a GCP project. [link](https://console.cloud.google.com/cloud-resource-manager) +* Make sure that billing is enabled for your project. [link](https://cloud.google.com/billing/docs/how-to/modify-project) +* Enable the Cloud Speech-to-Text API. [link](https://console.cloud.google.com/apis/dashboard) +* Set up authentication: + * Go to the "APIs & Services" -> "Credentials" page in the GCP Console and your project. [link](https://console.cloud.google.com/apis/credentials) + * From the "Create credentials" drop-down list, select "OAuth client ID". + * Select application type "TV and Limited Input" and enter a name into the "Name" field. + * Click Create. A pop-up appears, showing your "client ID" and "client secret". + +## Configuration + +### Authentication Configuration + +Use your favorite configuration UI to edit **Settings / Other Services - Google Cloud Speech-to-Text** and set: + +* **Client Id** - Google Cloud Platform OAuth 2.0-Client Id. +* **Client Secret** - Google Cloud Platform OAuth 2.0-Client Secret. +* **Authorization Code** - The authorization code is a one-time code needed to retrieve the necessary access tokens from Google Cloud Platform. **Please go to your browser ...** [https://accounts.google.com/o/oauth2/auth?client_id=<clientId>&redirect_uri=urn:ietf:wg:oauth:2.0:oob&scope=https://www.googleapis.com/auth/cloud-platform&response_type=code](https://accounts.google.com/o/oauth2/auth?client_id=<clientId>&redirect_uri=urn:ietf:wg:oauth:2.0:oob&scope=https://www.googleapis.com/auth/cloud-platform&response_type=code) (replace `<clientId>` by your Client Id) **... to generate an auth-code and paste it here**. After initial authorization, this code is not needed anymore. 
+ +### Speech to Text Configuration + +Use your favorite configuration UI to edit **Settings / Other Services - Google Cloud Speech-to-Text**: + +* **Single Utterance Mode** - When enabled, Google Cloud Platform is responsible for detecting when to stop listening after a single utterance. (Recommended) +* **Max Transcription Seconds** - Max seconds to wait to force stop the transcription. +* **Max Silence Seconds** - Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop listening. +* **Refresh Supported Locales** - Try loading supported locales from the documentation page. + +### Messages Configuration + +Use your favorite configuration UI to edit **Settings / Other Services - Google Cloud Speech-to-Text**: + +* **No Results Message** - Message to be told when there are no results. (Empty for disabled) +* **Error Message** - Message to be told when an error has happened. (Empty for disabled) + +### Configuration via a text file + +In case you would like to set up the service via a text file, create a new file in `$OPENHAB_ROOT/conf/services` named `googlestt.cfg` + +Its contents should look similar to: + +``` +org.openhab.voice.googlestt:clientId=ID +org.openhab.voice.googlestt:clientSecret=SECRET +org.openhab.voice.googlestt:oauthCode=XXXXX +org.openhab.voice.googlestt:singleUtteranceMode=true +org.openhab.voice.googlestt:maxTranscriptionSeconds=60 +org.openhab.voice.googlestt:maxSilenceSeconds=5 +org.openhab.voice.googlestt:refreshSupportedLocales=false +org.openhab.voice.googlestt:noResultsMessage="Sorry, I didn't understand you" +org.openhab.voice.googlestt:errorMessage="Sorry, something went wrong" +``` diff --git a/bundles/org.openhab.voice.googlestt/pom.xml b/bundles/org.openhab.voice.googlestt/pom.xml new file mode 100644 index 000000000..07c8afa3f --- /dev/null +++ b/bundles/org.openhab.voice.googlestt/pom.xml @@ -0,0 +1,161 @@ + + + + 4.0.0 + + + org.openhab.addons.bundles + org.openhab.addons.reactor.bundles + 
3.3.0-SNAPSHOT + + + org.openhab.voice.googlestt + + openHAB Add-ons :: Bundles :: Voice :: Google Cloud Speech to Text + + !*opencensus*,!org.bouncycastle*,!*jboss*,!javax.annotation.*,!net.jpountz.*,!lzma.sdk.*,org.eclipse.jetty.*;resolution:=optional,com.ning.*;resolution:=optional,com.jcraft.*;resolution:=optional,com.google.re2j.*;resolution:=optional,com.google.api.client.*;resolution:=optional,org.conscrypt.*;resolution:=optional,!io.grpc.census.*,com.sun.jndi.dns.*;resolution:=optional,org.apache.log.*;resolution:=optional,org.apache.http.*;resolution:=optional,sun.security.*;resolution:=optional,com.oracle.svm.core.annotate.*;resolution:=optional,*blockhound*;resolution:=optional,com.google.protobuf.nano.*;resolution:=optional,io.grpc.*;resolution:=optional,com.google.protobuf.*;resolution:=optional,io.perfmark.*;resolution:=optional + + + + com.google.cloud + google-cloud-speech + 2.2.2 + compile + + + + com.google.api.grpc + proto-google-common-protos + 2.7.1 + compile + + + com.google.http-client + google-http-client + 1.40.1 + compile + + + com.google.auth + google-auth-library-credentials + 1.2.1 + compile + + + com.google.auth + google-auth-library-oauth2-http + 1.3.0 + compile + + + com.google.api.grpc + proto-google-cloud-speech-v1 + 2.2.2 + compile + + + com.google.protobuf + protobuf-java + 3.19.2 + compile + + + com.google.api + gax + 2.8.1 + compile + + + com.google.api + gax-grpc + 2.8.1 + compile + + + com.google.api + api-common + 2.1.2 + compile + + + org.threeten + threetenbp + 1.5.2 + compile + + + io.perfmark + perfmark-api + 0.23.0 + compile + + + io.grpc + grpc-api + 1.43.2 + compile + + + io.grpc + grpc-protobuf + 1.42.1 + compile + + + io.grpc + grpc-protobuf-lite + 1.42.1 + compile + + + io.grpc + grpc-alts + 1.43.2 + compile + + + io.grpc + grpc-grpclb + 1.43.2 + compile + + + io.grpc + grpc-auth + 1.43.2 + compile + + + io.grpc + grpc-core + 1.43.2 + compile + + + io.grpc + grpc-context + 1.43.2 + compile + + + io.grpc + 
grpc-netty-shaded + 1.43.2 + compile + + + io.grpc + grpc-xds + 1.43.2 + compile + + + io.grpc + grpc-services + 1.43.2 + compile + + + + diff --git a/bundles/org.openhab.voice.googlestt/src/main/feature/feature.xml b/bundles/org.openhab.voice.googlestt/src/main/feature/feature.xml new file mode 100644 index 000000000..acf5ea0b8 --- /dev/null +++ b/bundles/org.openhab.voice.googlestt/src/main/feature/feature.xml @@ -0,0 +1,9 @@ + + + mvn:org.openhab.core.features.karaf/org.openhab.core.features.karaf.openhab-core/${ohc.version}/xml/features + + + openhab-runtime-base + mvn:org.openhab.addons.bundles/org.openhab.voice.googlestt/${project.version} + + diff --git a/bundles/org.openhab.voice.googlestt/src/main/java/org/openhab/voice/googlestt/internal/GoogleSTTConfiguration.java b/bundles/org.openhab.voice.googlestt/src/main/java/org/openhab/voice/googlestt/internal/GoogleSTTConfiguration.java new file mode 100644 index 000000000..a844bdb7b --- /dev/null +++ b/bundles/org.openhab.voice.googlestt/src/main/java/org/openhab/voice/googlestt/internal/GoogleSTTConfiguration.java @@ -0,0 +1,61 @@ +/** + * Copyright (c) 2010-2022 Contributors to the openHAB project + * + * See the NOTICE file(s) distributed with this work for additional + * information. + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0 + * + * SPDX-License-Identifier: EPL-2.0 + */ +package org.openhab.voice.googlestt.internal; + +import org.eclipse.jdt.annotation.NonNullByDefault; + +/** + * The {@link GoogleSTTConfiguration} class contains fields mapping thing configuration parameters. 
+ * + * @author Miguel Álvarez - Initial contribution + */ +@NonNullByDefault +public class GoogleSTTConfiguration { + /** + * Google Cloud Client ID, needs Speech To Text API enabled + */ + public String clientId = ""; + /** + * Google Cloud Client Secret + */ + public String clientSecret = ""; + /** + * Code for obtain oauth access token + */ + public String oauthCode = ""; + /** + * Message to be told when no results. + */ + public String noResultsMessage = ""; + /** + * Message to be told when an error has happened. + */ + public String errorMessage = ""; + /** + * Max seconds to wait to force stop the transcription. + */ + public int maxTranscriptionSeconds = 60; + /** + * Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop + * listening. + */ + public int maxSilenceSeconds = 5; + /** + * Single phrase mode. + */ + public boolean singleUtteranceMode = true; + /** + * Try loading supported locales from the documentation page. + */ + public boolean refreshSupportedLocales = false; +} diff --git a/bundles/org.openhab.voice.googlestt/src/main/java/org/openhab/voice/googlestt/internal/GoogleSTTConstants.java b/bundles/org.openhab.voice.googlestt/src/main/java/org/openhab/voice/googlestt/internal/GoogleSTTConstants.java new file mode 100644 index 000000000..aa9dd6e54 --- /dev/null +++ b/bundles/org.openhab.voice.googlestt/src/main/java/org/openhab/voice/googlestt/internal/GoogleSTTConstants.java @@ -0,0 +1,43 @@ +/** + * Copyright (c) 2010-2022 Contributors to the openHAB project + * + * See the NOTICE file(s) distributed with this work for additional + * information. 
+ * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0 + * + * SPDX-License-Identifier: EPL-2.0 + */ +package org.openhab.voice.googlestt.internal; + +import org.eclipse.jdt.annotation.NonNullByDefault; + +/** + * The {@link GoogleSTTConstants} class defines common constants, which are + * used across the whole binding. + * + * @author Miguel Álvarez - Initial contribution + */ +@NonNullByDefault +public class GoogleSTTConstants { + /** + * Service name + */ + public static final String SERVICE_NAME = "Google Cloud Speech-to-Text"; + /** + * Service id + */ + public static final String SERVICE_ID = "googlestt"; + + /** + * Service category + */ + public static final String SERVICE_CATEGORY = "voice"; + + /** + * Service pid + */ + public static final String SERVICE_PID = "org.openhab." + SERVICE_CATEGORY + "." + SERVICE_ID; +} diff --git a/bundles/org.openhab.voice.googlestt/src/main/java/org/openhab/voice/googlestt/internal/GoogleSTTLocale.java b/bundles/org.openhab.voice.googlestt/src/main/java/org/openhab/voice/googlestt/internal/GoogleSTTLocale.java new file mode 100644 index 000000000..67ef9d97c --- /dev/null +++ b/bundles/org.openhab.voice.googlestt/src/main/java/org/openhab/voice/googlestt/internal/GoogleSTTLocale.java @@ -0,0 +1,92 @@ +/** + * Copyright (c) 2010-2022 Contributors to the openHAB project + * + * See the NOTICE file(s) distributed with this work for additional + * information. 
+ * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0 + * + * SPDX-License-Identifier: EPL-2.0 + */ +package org.openhab.voice.googlestt.internal; + +import java.io.IOException; +import java.net.HttpURLConnection; +import java.net.URL; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.eclipse.jdt.annotation.NonNullByDefault; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * The {@link GoogleSTTLocale} is responsible for loading supported locales for the Google Cloud Speech-to-Text service. + * + * @author Miguel Álvarez - Initial contribution + */ +@NonNullByDefault +public class GoogleSTTLocale { + private static final Set SUPPORTED_LOCALES = new HashSet<>(); + private static final String GC_STT_DOC_LANGUAGES = "https://cloud.google.com/speech-to-text/docs/languages"; + private static final String LOCAL_COPY = "af-ZA,sq-AL,am-ET,ar-DZ,ar-BH,ar-EG,ar-IQ,ar-IL,ar-JO,ar-KW,ar-LB,ar-MA,ar-OM,ar-QA,ar-SA,ar-PS,ar-TN,ar-AE,ar-YE,hy-AM,az-AZ,eu-ES,bn-BD,bn-IN,bs-BA,bg-BG,my-MM,ca-ES,hr-HR,cs-CZ,da-DK,nl-BE,nl-NL,en-AU,en-CA,en-GH,en-HK,en-IN,en-IE,en-KE,en-NZ,en-NG,en-PK,en-PH,en-SG,en-ZA,en-TZ,en-GB,en-US,et-EE,fi-FI,fr-BE,fr-CA,fr-FR,fr-CH,gl-ES,ka-GE,de-AT,de-DE,de-CH,el-GR,gu-IN,he-IL,hi-IN,hu-HU,is-IS,id-ID,it-IT,it-CH,ja-JP,jv-ID,kn-IN,kk-KZ,km-KH,ko-KR,lo-LA,lv-LV,lt-LT,mk-MK,ms-MY,ml-IN,mr-IN,mn-MN,ne-NP,no-NO,fa-IR,pl-PL,pt-BR,pt-PT,ro-RO,ru-RU,sr-RS,si-LK,sk-SK,sl-SI,es-AR,es-BO,es-CL,es-CO,es-CR,es-DO,es-EC,es-SV,es-GT,es-HN,es-MX,es-NI,es-PA,es-PY,es-PE,es-PR,es-ES,es-US,es-UY,es-VE,su-ID,sw-KE,sw-TZ,sv-SE,ta-IN,ta-MY,ta-SG,ta-LK,te-IN,th-TH,tr-TR,uk-UA,ur-IN,ur-PK,uz-UZ,vi-VN,zu-ZA"; + + public static Set getSupportedLocales() { + return SUPPORTED_LOCALES; + } + + public static 
void loadLocales(boolean fromDoc) { + Logger logger = LoggerFactory.getLogger(GoogleSTTLocale.class); + if (!SUPPORTED_LOCALES.isEmpty()) { + logger.debug("Languages already loaded"); + return; + } + if (!fromDoc) { + logger.debug("Loading languages from local"); + loadLocalesFromLocal(); + return; + } + logger.debug("Loading languages from doc"); + try { + URL url = new URL(GC_STT_DOC_LANGUAGES); + HttpURLConnection con = (HttpURLConnection) url.openConnection(); + con.setRequestMethod("GET"); + con.setRequestProperty("Content-Type", "text/html"); + int status = con.getResponseCode(); + if (status != 200) { + logger.warn("Http error loading supported locales, code: {}", status); + loadLocalesFromLocal(); + return; + } + String html = new String(con.getInputStream().readAllBytes()); + Pattern pattern = Pattern.compile("\\(?[a-z]{2})\\-(?[A-Z]{2})\\<\\/td\\>", + Pattern.MULTILINE); + Matcher matcher = pattern.matcher(html); + Locale lastLocale = null; + while (matcher.find()) { + Locale locale = new Locale(matcher.group("lang"), matcher.group("country")); + if (lastLocale == null || !lastLocale.equals(locale)) { + lastLocale = locale; + SUPPORTED_LOCALES.add(locale); + logger.debug("Locale added {}", locale.toLanguageTag()); + } + } + } catch (IOException e) { + logger.warn("Error loading supported locales: {}", e.getMessage()); + loadLocalesFromLocal(); + } + } + + private static void loadLocalesFromLocal() { + Arrays.stream(LOCAL_COPY.split(",")).map((localeTag) -> { + String[] localeTagParts = localeTag.split("-"); + return new Locale(localeTagParts[0], localeTagParts[1]); + }).forEach(SUPPORTED_LOCALES::add); + } +} diff --git a/bundles/org.openhab.voice.googlestt/src/main/java/org/openhab/voice/googlestt/internal/GoogleSTTService.java b/bundles/org.openhab.voice.googlestt/src/main/java/org/openhab/voice/googlestt/internal/GoogleSTTService.java new file mode 100644 index 000000000..a2f7bbe74 --- /dev/null +++ 
b/bundles/org.openhab.voice.googlestt/src/main/java/org/openhab/voice/googlestt/internal/GoogleSTTService.java @@ -0,0 +1,389 @@ +/** + * Copyright (c) 2010-2022 Contributors to the openHAB project + * + * See the NOTICE file(s) distributed with this work for additional + * information. + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0 + * + * SPDX-License-Identifier: EPL-2.0 + */ +package org.openhab.voice.googlestt.internal; + +import static org.openhab.voice.googlestt.internal.GoogleSTTConstants.*; + +import java.io.IOException; +import java.util.*; +import java.util.concurrent.Future; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.function.Consumer; + +import org.eclipse.jdt.annotation.NonNullByDefault; +import org.eclipse.jdt.annotation.Nullable; +import org.openhab.core.audio.AudioFormat; +import org.openhab.core.audio.AudioStream; +import org.openhab.core.auth.client.oauth2.*; +import org.openhab.core.common.ThreadPoolManager; +import org.openhab.core.config.core.ConfigurableService; +import org.openhab.core.config.core.Configuration; +import org.openhab.core.voice.*; +import org.osgi.framework.Constants; +import org.osgi.service.cm.ConfigurationAdmin; +import org.osgi.service.component.annotations.Activate; +import org.osgi.service.component.annotations.Component; +import org.osgi.service.component.annotations.Modified; +import org.osgi.service.component.annotations.Reference; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.api.gax.rpc.ClientStream; +import com.google.api.gax.rpc.ResponseObserver; +import com.google.api.gax.rpc.StreamController; +import com.google.auth.Credentials; +import com.google.auth.oauth2.AccessToken; +import com.google.auth.oauth2.OAuth2Credentials; +import com.google.cloud.speech.v1.*; 
+import com.google.protobuf.ByteString; + +import io.grpc.LoadBalancerRegistry; +import io.grpc.internal.PickFirstLoadBalancerProvider; + +/** + * The {@link GoogleSTTService} class is a service implementation to use Google Cloud Speech-to-Text features. + * + * @author Miguel Álvarez - Initial contribution + */ +@NonNullByDefault +@Component(configurationPid = SERVICE_PID, property = Constants.SERVICE_PID + "=" + SERVICE_PID) +@ConfigurableService(category = SERVICE_CATEGORY, label = SERVICE_NAME, description_uri = SERVICE_CATEGORY + ":" + + SERVICE_ID) +public class GoogleSTTService implements STTService { + + private static final String GCP_AUTH_URI = "https://accounts.google.com/o/oauth2/auth"; + private static final String GCP_TOKEN_URI = "https://accounts.google.com/o/oauth2/token"; + private static final String GCP_REDIRECT_URI = "urn:ietf:wg:oauth:2.0:oob"; + private static final String GCP_SCOPE = "https://www.googleapis.com/auth/cloud-platform"; + + private final Logger logger = LoggerFactory.getLogger(GoogleSTTService.class); + private final ScheduledExecutorService executor = ThreadPoolManager.getScheduledPool("OH-voice-googlestt"); + private final OAuthFactory oAuthFactory; + private final ConfigurationAdmin configAdmin; + + private GoogleSTTConfiguration config = new GoogleSTTConfiguration(); + private @Nullable OAuthClientService oAuthService; + + @Activate + public GoogleSTTService(final @Reference OAuthFactory oAuthFactory, + final @Reference ConfigurationAdmin configAdmin) { + LoadBalancerRegistry.getDefaultRegistry().register(new PickFirstLoadBalancerProvider()); + this.oAuthFactory = oAuthFactory; + this.configAdmin = configAdmin; + } + + @Activate + protected void activate(Map config) { + this.config = new Configuration(config).as(GoogleSTTConfiguration.class); + executor.submit(() -> GoogleSTTLocale.loadLocales(this.config.refreshSupportedLocales)); + updateConfig(); + } + + @Modified + protected void modified(Map config) { + this.config = new 
Configuration(config).as(GoogleSTTConfiguration.class); + updateConfig(); + } + + @Override + public String getId() { + return SERVICE_ID; + } + + @Override + public String getLabel(@Nullable Locale locale) { + return SERVICE_NAME; + } + + @Override + public Set getSupportedLocales() { + return GoogleSTTLocale.getSupportedLocales(); + } + + @Override + public Set getSupportedFormats() { + return Set.of( + new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_SIGNED, false, 16, null, 16000L), + new AudioFormat(AudioFormat.CONTAINER_OGG, "OPUS", null, null, null, 8000L), + new AudioFormat(AudioFormat.CONTAINER_OGG, "OPUS", null, null, null, 12000L), + new AudioFormat(AudioFormat.CONTAINER_OGG, "OPUS", null, null, null, 16000L), + new AudioFormat(AudioFormat.CONTAINER_OGG, "OPUS", null, null, null, 24000L), + new AudioFormat(AudioFormat.CONTAINER_OGG, "OPUS", null, null, null, 48000L)); + } + + @Override + public STTServiceHandle recognize(STTListener sttListener, AudioStream audioStream, Locale locale, + Set set) { + AtomicBoolean keepStreaming = new AtomicBoolean(true); + Future scheduledTask = backgroundRecognize(sttListener, audioStream, keepStreaming, locale, set); + return new STTServiceHandle() { + @Override + public void abort() { + keepStreaming.set(false); + try { + Thread.sleep(100); + } catch (InterruptedException e) { + } + scheduledTask.cancel(true); + } + }; + } + + private void updateConfig() { + String clientId = this.config.clientId; + String clientSecret = this.config.clientSecret; + if (!clientId.isBlank() && !clientSecret.isBlank()) { + var oAuthService = oAuthFactory.createOAuthClientService(SERVICE_PID, GCP_TOKEN_URI, GCP_AUTH_URI, clientId, + clientSecret, GCP_SCOPE, false); + this.oAuthService = oAuthService; + if (!this.config.oauthCode.isEmpty()) { + getAccessToken(oAuthService, this.config.oauthCode); + deleteAuthCode(); + } + } else { + logger.warn("Missing authentication configuration to access Google Cloud STT API."); + } + } 
+ + private void getAccessToken(OAuthClientService oAuthService, String oauthCode) { + logger.debug("Trying to get access and refresh tokens."); + try { + oAuthService.getAccessTokenResponseByAuthorizationCode(oauthCode, GCP_REDIRECT_URI); + } catch (OAuthException | OAuthResponseException e) { + if (logger.isDebugEnabled()) { + logger.debug("Error fetching access token: {}", e.getMessage(), e); + } else { + logger.warn("Error fetching access token. Invalid oauth code? Please generate a new one."); + } + } catch (IOException e) { + logger.warn("An unexpected IOException occurred when fetching access token: {}", e.getMessage()); + } + } + + private void deleteAuthCode() { + try { + org.osgi.service.cm.Configuration serviceConfig = configAdmin.getConfiguration(SERVICE_PID); + Dictionary configProperties = serviceConfig.getProperties(); + if (configProperties != null) { + configProperties.put("oauthCode", ""); + serviceConfig.update(configProperties); + } + } catch (IOException e) { + logger.warn("Failed to delete current oauth code, please delete it manually."); + } + } + + private Future backgroundRecognize(STTListener sttListener, AudioStream audioStream, AtomicBoolean keepStreaming, + Locale locale, Set set) { + Credentials credentials = getCredentials(); + return executor.submit(() -> { + logger.debug("Background recognize starting"); + ClientStream clientStream = null; + try (SpeechClient client = SpeechClient + .create(SpeechSettings.newBuilder().setCredentialsProvider(() -> credentials).build())) { + TranscriptionListener responseObserver = new TranscriptionListener(sttListener, config, + (t) -> keepStreaming.set(false)); + clientStream = client.streamingRecognizeCallable().splitCall(responseObserver); + streamAudio(clientStream, audioStream, responseObserver, keepStreaming, locale); + clientStream.closeSend(); + logger.debug("Background recognize done"); + } catch (IOException e) { + if (clientStream != null && clientStream.isSendReady()) { + 
clientStream.closeSendWithError(e); + } else if (!config.errorMessage.isBlank()) { + logger.warn("Error running speech to text: {}", e.getMessage()); + sttListener.sttEventReceived(new SpeechRecognitionErrorEvent(config.errorMessage)); + } + } + }); + } + + private void streamAudio(ClientStream clientStream, AudioStream audioStream, + TranscriptionListener responseObserver, AtomicBoolean keepStreaming, Locale locale) throws IOException { + // Gather stream info and send config + AudioFormat streamFormat = audioStream.getFormat(); + RecognitionConfig.AudioEncoding streamEncoding; + if (AudioFormat.WAV.isCompatible(streamFormat)) { + streamEncoding = RecognitionConfig.AudioEncoding.LINEAR16; + } else if (AudioFormat.OGG.isCompatible(streamFormat)) { + streamEncoding = RecognitionConfig.AudioEncoding.OGG_OPUS; + } else { + logger.debug("Unsupported format {}", streamFormat); + return; + } + Integer channelsObject = streamFormat.getChannels(); + int channels = channelsObject != null ? channelsObject : 1; + Long longFrequency = streamFormat.getFrequency(); + if (longFrequency == null) { + logger.debug("Missing frequency info"); + return; + } + int frequency = Math.toIntExact(longFrequency); + // First thing we need to send the stream config + sendStreamConfig(clientStream, streamEncoding, frequency, channels, locale); + // Loop sending audio data + long startTime = System.currentTimeMillis(); + long maxTranscriptionMillis = (config.maxTranscriptionSeconds * 1000L); + long maxSilenceMillis = (config.maxSilenceSeconds * 1000L); + int readBytes = 6400; + while (keepStreaming.get()) { + byte[] data = new byte[readBytes]; + int dataN = audioStream.read(data); + if (!keepStreaming.get() || isExpiredInterval(maxTranscriptionMillis, startTime)) { + logger.debug("Stops listening, max transcription time reached"); + break; + } + if (!config.singleUtteranceMode + && isExpiredInterval(maxSilenceMillis, responseObserver.getLastInputTime())) { + logger.debug("Stops listening, max 
silence time reached"); + break; + } + if (dataN != readBytes) { + try { + Thread.sleep(100); + } catch (InterruptedException e) { + } + continue; + } + StreamingRecognizeRequest dataRequest = StreamingRecognizeRequest.newBuilder() + .setAudioContent(ByteString.copyFrom(data)).build(); + logger.debug("Sending audio data {}", dataN); + clientStream.send(dataRequest); + } + } + + private void sendStreamConfig(ClientStream clientStream, + RecognitionConfig.AudioEncoding encoding, int sampleRate, int channels, Locale locale) { + RecognitionConfig recognitionConfig = RecognitionConfig.newBuilder().setEncoding(encoding) + .setAudioChannelCount(channels).setLanguageCode(locale.toLanguageTag()).setSampleRateHertz(sampleRate) + .build(); + + StreamingRecognitionConfig streamingRecognitionConfig = StreamingRecognitionConfig.newBuilder() + .setConfig(recognitionConfig).setInterimResults(false).setSingleUtterance(config.singleUtteranceMode) + .build(); + + clientStream + .send(StreamingRecognizeRequest.newBuilder().setStreamingConfig(streamingRecognitionConfig).build()); + } + + private @Nullable Credentials getCredentials() { + String accessToken = null; + try { + OAuthClientService oAuthService = this.oAuthService; + if (oAuthService != null) { + AccessTokenResponse response = oAuthService.getAccessTokenResponse(); + if (response != null) { + accessToken = response.getAccessToken(); + } + } + } catch (OAuthException | IOException | OAuthResponseException e) { + logger.warn("Access token error: {}", e.getMessage()); + } + if (accessToken == null) { + logger.warn("Missed google cloud access token"); + return null; + } + return OAuth2Credentials.create(new AccessToken(accessToken, null)); + } + + private boolean isExpiredInterval(long interval, long referenceTime) { + return System.currentTimeMillis() - referenceTime > interval; + } + + private static class TranscriptionListener implements ResponseObserver { + private final Logger logger = 
LoggerFactory.getLogger(TranscriptionListener.class); + private final StringBuilder transcriptBuilder = new StringBuilder(); + private final STTListener sttListener; + GoogleSTTConfiguration config; + private final Consumer<@Nullable Throwable> completeListener; + private float confidenceSum = 0; + private int responseCount = 0; + private long lastInputTime = 0; + + public TranscriptionListener(STTListener sttListener, GoogleSTTConfiguration config, + Consumer<@Nullable Throwable> completeListener) { + this.sttListener = sttListener; + this.config = config; + this.completeListener = completeListener; + } + + public void onStart(@Nullable StreamController controller) { + sttListener.sttEventReceived(new SpeechStartEvent()); + lastInputTime = System.currentTimeMillis(); + } + + public void onResponse(StreamingRecognizeResponse response) { + lastInputTime = System.currentTimeMillis(); + List results = response.getResultsList(); + logger.debug("Got {} results", response.getResultsList().size()); + if (results.isEmpty()) { + logger.debug("No results"); + return; + } + results.forEach(result -> { + List alternatives = result.getAlternativesList(); + logger.debug("Got {} alternatives", alternatives.size()); + SpeechRecognitionAlternative alternative = alternatives.stream() + .max(Comparator.comparing(SpeechRecognitionAlternative::getConfidence)).orElse(null); + if (alternative == null) { + return; + } + String transcript = alternative.getTranscript(); + logger.debug("Alternative transcript: {}", transcript); + logger.debug("Alternative confidence: {}", alternative.getConfidence()); + if (result.getIsFinal()) { + transcriptBuilder.append(transcript); + confidenceSum += alternative.getConfidence(); + responseCount++; + // when in single utterance mode we can just get one final result so complete + if (config.singleUtteranceMode) { + completeListener.accept(null); + } + } + }); + } + + public void onComplete() { + sttListener.sttEventReceived(new SpeechStopEvent()); + float 
averageConfidence = confidenceSum / (float) responseCount; + String transcript = transcriptBuilder.toString(); + if (!transcript.isBlank()) { + sttListener.sttEventReceived(new SpeechRecognitionEvent(transcript, averageConfidence)); + } else { + if (!config.noResultsMessage.isBlank()) { + sttListener.sttEventReceived(new SpeechRecognitionErrorEvent(config.noResultsMessage)); + } else { + sttListener.sttEventReceived(new SpeechRecognitionErrorEvent("No results")); + } + } + } + + public void onError(@Nullable Throwable t) { + logger.warn("Recognition error: ", t); + completeListener.accept(t); + sttListener.sttEventReceived(new SpeechStopEvent()); + if (!config.errorMessage.isBlank()) { + sttListener.sttEventReceived(new SpeechRecognitionErrorEvent(config.errorMessage)); + } else { + String errorMessage = t.getMessage(); + sttListener.sttEventReceived( + new SpeechRecognitionErrorEvent(errorMessage != null ? errorMessage : "Unknown error")); + } + } + + public long getLastInputTime() { + return lastInputTime; + } + } +} diff --git a/bundles/org.openhab.voice.googlestt/src/main/resources/OH-INF/config/config.xml b/bundles/org.openhab.voice.googlestt/src/main/resources/OH-INF/config/config.xml new file mode 100644 index 000000000..171147383 --- /dev/null +++ b/bundles/org.openhab.voice.googlestt/src/main/resources/OH-INF/config/config.xml @@ -0,0 +1,67 @@ + + + + + + + Authentication for connecting to Google Cloud Platform. + + + + Configure Speech to Text. + + + + Configure service information messages. + + + + Google Cloud Platform OAuth 2.0-Client Id. + + + password + + Google Cloud Platform OAuth 2.0-Client Secret. + + + + Please go to your browser ... https://accounts.google.com/o/oauth2/auth?client_id=\&redirect_uri=urn:ietf:wg:oauth:2.0:oob&scope=https://www.googleapis.com/auth/cloud-platform&response_type=code ... 
to generate an auth-code and paste it here.]]> + + + + When enabled Google Cloud Platform is responsible for detecting when to stop listening after a single + utterance. (Recommended) + true + + + + Max seconds to wait to force stop the transcription. + 60 + + + + Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop + listening. + 5 + + + + Try loading supported locales from the documentation page. + false + + + + Message to be told when no results. (Empty for disabled) + Sorry, I didn't understand you + + + + Message to be told when an error has happened. (Empty for disabled) + Sorry, something went wrong + + + diff --git a/bundles/org.openhab.voice.googlestt/src/main/resources/OH-INF/i18n/googlestt.properties b/bundles/org.openhab.voice.googlestt/src/main/resources/OH-INF/i18n/googlestt.properties new file mode 100644 index 000000000..c8ac8005a --- /dev/null +++ b/bundles/org.openhab.voice.googlestt/src/main/resources/OH-INF/i18n/googlestt.properties @@ -0,0 +1,28 @@ +voice.config.googlestt.clientId.label = Client Id +voice.config.googlestt.clientId.description = Google Cloud Platform OAuth 2.0-Client Id. +voice.config.googlestt.clientSecret.label = Client Secret +voice.config.googlestt.clientSecret.description = Google Cloud Platform OAuth 2.0-Client Secret. +voice.config.googlestt.errorMessage.label = Error Message +voice.config.googlestt.errorMessage.description = Message to be told when an error has happened. (Empty for disabled) +voice.config.googlestt.group.authentication.label = Authentication +voice.config.googlestt.group.authentication.description = Authentication for connecting to Google Cloud Platform. +voice.config.googlestt.group.messages.label = Info Messages +voice.config.googlestt.group.messages.description = Configure service information messages. +voice.config.googlestt.group.stt.label = STT Configuration +voice.config.googlestt.group.stt.description = Configure Speech to Text. 
+voice.config.googlestt.maxSilenceSeconds.label = Max Silence Seconds +voice.config.googlestt.maxSilenceSeconds.description = Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop listening. +voice.config.googlestt.maxTranscriptionSeconds.label = Max Transcription Seconds +voice.config.googlestt.maxTranscriptionSeconds.description = Max seconds to wait to force stop the transcription. +voice.config.googlestt.noResultsMessage.label = No Results Message +voice.config.googlestt.noResultsMessage.description = Message to be told when no results. (Empty for disabled) +voice.config.googlestt.oauthCode.label = Authorization Code +voice.config.googlestt.oauthCode.description = The oauth code is a one-time code needed to retrieve the necessary access token from Google Cloud Platform. Please go to your browser ... https://accounts.google.com/o/oauth2/auth?client_id=\&redirect_uri=urn:ietf:wg:oauth:2.0:oob&scope=https://www.googleapis.com/auth/cloud-platform&response_type=code ... to generate an auth-code and paste it here. +voice.config.googlestt.refreshSupportedLocales.label = Refresh Supported Locales +voice.config.googlestt.refreshSupportedLocales.description = Try loading supported locales from the documentation page. +voice.config.googlestt.singleUtteranceMode.label = Single Utterance Mode +voice.config.googlestt.singleUtteranceMode.description = When enabled Google Cloud Platform is responsible for detecting when to stop listening after a single utterance. (Recommended) + +# service + +service.voice.googlestt.label = Google Cloud Speech-to-Text diff --git a/bundles/pom.xml b/bundles/pom.xml index 4da92f059..5ef44f682 100644 --- a/bundles/pom.xml +++ b/bundles/pom.xml @@ -392,6 +392,7 @@ org.openhab.persistence.mongodb org.openhab.persistence.rrd4j + org.openhab.voice.googlestt org.openhab.voice.googletts org.openhab.voice.mactts org.openhab.voice.marytts