org.openhab.voice.googletts.internal.GoogleCloudAPI.java Source code

Introduction

Here is the source code for org.openhab.voice.googletts.internal.GoogleCloudAPI.java
Source

/**
 * Copyright (c) 2010-2019 Contributors to the openHAB project
 *
 * See the NOTICE file(s) distributed with this work for additional
 * information.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Public License 2.0 which is available at
 * http://www.eclipse.org/legal/epl-2.0
 *
 * SPDX-License-Identifier: EPL-2.0
 */
package org.openhab.voice.googletts.internal;

import static java.util.Collections.*;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Base64;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.commons.io.FilenameUtils;
import org.eclipse.smarthome.core.audio.AudioFormat;
import org.eclipse.smarthome.io.net.http.HttpRequestBuilder;
import org.openhab.voice.googletts.internal.protocol.AudioConfig;
import org.openhab.voice.googletts.internal.protocol.AudioEncoding;
import org.openhab.voice.googletts.internal.protocol.ListVoicesResponse;
import org.openhab.voice.googletts.internal.protocol.SsmlVoiceGender;
import org.openhab.voice.googletts.internal.protocol.SynthesisInput;
import org.openhab.voice.googletts.internal.protocol.SynthesizeSpeechRequest;
import org.openhab.voice.googletts.internal.protocol.SynthesizeSpeechResponse;
import org.openhab.voice.googletts.internal.protocol.Voice;
import org.openhab.voice.googletts.internal.protocol.VoiceSelectionParams;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.api.gax.core.FixedCredentialsProvider;
import com.google.auth.Credentials;
import com.google.auth.oauth2.GoogleCredentials;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;

/**
 * Google Cloud TTS API call implementation.
 *
 * @author Gabor Bicskei - Initial contribution and API
 */
class GoogleCloudAPI {
    /**
     * Default encoding
     */
    private static final String UTF_8 = "UTF-8";

    /**
     * JSON content type
     */
    private static final String APPLICATION_JSON = "application/json";

    /**
     * Authorization header
     */
    private static final String AUTH_HEADER_NAME = "Authorization";

    /**
     * Google Cloud Platform authorization scope
     */
    private static final String GCP_SCOPE = "https://www.googleapis.com/auth/cloud-platform";

    /**
     * URL used for retrieving the list of available voices
     */
    private static final String LIST_VOICES_URL = "https://texttospeech.googleapis.com/v1/voices";

    /**
     * URL used for synthesizing text to speech
     */
    private static final String SYTNHESIZE_SPEECH_URL = "https://texttospeech.googleapis.com/v1/text:synthesize";

    /**
     * Logger
     */
    private final Logger logger = LoggerFactory.getLogger(GoogleCloudAPI.class);

    /**
     * Supported voices and locales
     */
    private final Map<Locale, Set<GoogleTTSVoice>> voices = new HashMap<>();

    /**
     * Cache folder
     */
    private File cacheFolder;

    /**
     * Configuration
     */
    private GoogleTTSConfig config;

    /**
     * Status flag
     */
    private boolean initialized;

    private Credentials credentials;

    private final Gson gson = new GsonBuilder().create();

    /**
     * Constructor.
     *
     * @param cacheFolder Service cache folder
     */
    GoogleCloudAPI(File cacheFolder) {
        this.cacheFolder = cacheFolder;
    }

    /**
     * Configuration update.
     *
     * @param config New configuration.
     */
    void setConfig(GoogleTTSConfig config) {
        this.config = config;
        String serviceAccountKey = config.getServiceAccountKey();
        if (serviceAccountKey != null && !serviceAccountKey.isEmpty()) {
            try {
                credentials = createCredentials(serviceAccountKey);
                initialized = true;
                initVoices();
            } catch (IOException e) {
                logger.error("Error initializing the service", e);
                initialized = false;
            }
        } else {
            credentials = null;
            initialized = false;
            voices.clear();
        }

        // maintain cache
        if (config.getPurgeCache() != null && config.getPurgeCache()) {
            File[] files = cacheFolder.listFiles();
            if (files != null && files.length > 0) {
                Arrays.stream(files).forEach(File::delete);
            }
            logger.debug("Cache purged.");
        }
    }

    private Credentials createCredentials(String serviceAccountKey) throws IOException {
        try (ByteArrayInputStream bis = new ByteArrayInputStream(serviceAccountKey.getBytes())) {
            GoogleCredentials credential = GoogleCredentials.fromStream(bis)
                    .createScoped(Collections.singleton(GCP_SCOPE));
            return FixedCredentialsProvider.create(credential).getCredentials();
        }
    }

    private String getAuthorization() throws IOException {
        Map<String, List<String>> metadata = credentials.getRequestMetadata();
        return metadata.get(AUTH_HEADER_NAME).get(0);
    }

    /**
     * Loads supported audio formats
     *
     * @return Set of audio formats
     */
    Set<String> getSupportedAudioFormats() {
        Set<String> formats = new HashSet<>();
        for (AudioEncoding audioEncoding : AudioEncoding.values()) {
            if (audioEncoding != AudioEncoding.AUDIO_ENCODING_UNSPECIFIED) {
                formats.add(audioEncoding.toString());
            }
        }
        return formats;
    }

    /**
     * Supported locales.
     *
     * @return Set of locales
     */
    Set<Locale> getSupportedLocales() {
        return unmodifiableSet(voices.keySet());
    }

    /**
     * Supported voices for locale.
     *
     * @param locale Locale
     * @return Set of voices
     */
    Set<GoogleTTSVoice> getVoicesForLocale(Locale locale) {
        Set<GoogleTTSVoice> localeVoices = voices.get(locale);
        return localeVoices != null ? unmodifiableSet(localeVoices) : emptySet();
    }

    /**
     * Google API call to load locales and voices.
     */
    private void initVoices() throws IOException {
        if (credentials != null) {
            voices.clear();

            for (GoogleTTSVoice voice : listVoices()) {
                Locale locale = voice.getLocale();
                Set<GoogleTTSVoice> localeVoices;
                if (!voices.containsKey(locale)) {
                    localeVoices = new HashSet<>();
                    voices.put(locale, localeVoices);
                } else {
                    localeVoices = voices.get(locale);
                }
                localeVoices.add(voice);
            }
        } else {
            logger.error("Google client is not initialized!");
        }
    }

    @SuppressWarnings({ "unused", "null" })
    private List<GoogleTTSVoice> listVoices() throws IOException {
        HttpRequestBuilder builder = HttpRequestBuilder.getFrom(LIST_VOICES_URL).withHeader(AUTH_HEADER_NAME,
                getAuthorization());

        ListVoicesResponse listVoicesResponse = gson.fromJson(builder.getContentAsString(),
                ListVoicesResponse.class);

        if (listVoicesResponse == null || listVoicesResponse.getVoices() == null) {
            return emptyList();
        }

        List<GoogleTTSVoice> result = new ArrayList<>();
        for (Voice voice : listVoicesResponse.getVoices()) {
            for (String languageCode : voice.getLanguageCodes()) {
                result.add(new GoogleTTSVoice(Locale.forLanguageTag(languageCode), voice.getName(),
                        voice.getSsmlGender().name()));
            }
        }

        return result;
    }

    /**
     * Converts ESH audio format to Google parameters.
     *
     * @param codec Requested codec
     * @return String array of Google audio format and the file extension to use.
     */
    private String[] getFormatForCodec(String codec) {
        switch (codec) {
        case AudioFormat.CODEC_MP3:
            return new String[] { AudioEncoding.MP3.toString(), "mp3" };
        case AudioFormat.CODEC_PCM_SIGNED:
            return new String[] { AudioEncoding.LINEAR16.toString(), "wav" };
        default:
            throw new IllegalArgumentException("Audio format " + codec + " is not yet supported");
        }
    }

    byte[] synthesizeSpeech(String text, GoogleTTSVoice voice, String codec) {
        String[] format = getFormatForCodec(codec);
        String fileNameInCache = getUniqueFilenameForText(text, voice.getTechnicalName());
        File audioFileInCache = new File(cacheFolder, fileNameInCache + "." + format[1]);
        try {
            // check if in cache
            if (audioFileInCache.exists()) {
                logger.debug("Audio file {} was found in cache.", audioFileInCache.getName());
                return Files.readAllBytes(audioFileInCache.toPath());
            }

            // if not in cache, get audio data and put to cache
            byte[] audio = synthesizeSpeechByGoogle(text, voice, format[0]);
            if (audio != null) {
                saveAudioAndTextToFile(text, audioFileInCache, audio, voice.getTechnicalName());
            }
            return audio;
        } catch (FileNotFoundException ex) {
            logger.warn("Could not write {} to cache", audioFileInCache, ex);
            return null;
        } catch (IOException ex) {
            logger.error("Could not write {}to cache", audioFileInCache, ex);
            return null;
        }
    }

    /**
     * Create cache entry.
     *
     * @param text Converted text.
     * @param cacheFile Cache entry file.
     * @param audio Byte array of the audio.
     * @param voiceName Used voice
     * @throws IOException in case of file handling exceptions
     */
    private void saveAudioAndTextToFile(String text, File cacheFile, byte[] audio, String voiceName)
            throws IOException {
        logger.debug("Caching audio file {}", cacheFile.getName());
        try (FileOutputStream audioFileOutputStream = new FileOutputStream(cacheFile)) {
            audioFileOutputStream.write(audio);
        }

        // write text to file for transparency too
        // this allows to know which contents is in which audio file
        String textFileName = FilenameUtils.removeExtension(cacheFile.getName()) + ".txt";
        logger.debug("Caching text file {}", textFileName);
        try (FileOutputStream textFileOutputStream = new FileOutputStream(new File(cacheFolder, textFileName))) {
            // @formatter:off
            StringBuilder sb = new StringBuilder("Config: ").append(config.toConfigString()).append(",voice=")
                    .append(voiceName).append(System.lineSeparator()).append("Text: ").append(text)
                    .append(System.lineSeparator());
            // @formatter:on
            textFileOutputStream.write(sb.toString().getBytes(UTF_8));
        }
    }

    /**
     * Call Google service to synthesize the required text
     *
     * @param text Text to synthesize
     * @param voice Voice parameter
     * @param audioFormat Audio encoding format
     * @return Audio input stream or {@code null} when encoding exceptions occur
     */
    @SuppressWarnings({ "unused", "null" })
    private byte[] synthesizeSpeechByGoogle(String text, GoogleTTSVoice voice, String audioFormat)
            throws IOException {
        AudioConfig audioConfig = new AudioConfig(AudioEncoding.valueOf(audioFormat), config.getPitch(),
                config.getSpeakingRate(), config.getVolumeGainDb());
        SynthesisInput synthesisInput = new SynthesisInput(text);
        VoiceSelectionParams voiceSelectionParams = new VoiceSelectionParams(voice.getLocale().getLanguage(),
                voice.getLabel(), SsmlVoiceGender.valueOf(voice.getSsmlGender()));

        SynthesizeSpeechRequest request = new SynthesizeSpeechRequest(audioConfig, synthesisInput,
                voiceSelectionParams);

        HttpRequestBuilder builder = HttpRequestBuilder.postTo(SYTNHESIZE_SPEECH_URL)
                .withHeader(AUTH_HEADER_NAME, getAuthorization())
                .withContent(gson.toJson(request), APPLICATION_JSON);

        SynthesizeSpeechResponse synthesizeSpeechResponse = gson.fromJson(builder.getContentAsString(),
                SynthesizeSpeechResponse.class);

        if (synthesizeSpeechResponse == null) {
            return null;
        }

        byte[] encodedBytes = synthesizeSpeechResponse.getAudioContent().getBytes(StandardCharsets.UTF_8);
        return Base64.getDecoder().decode(encodedBytes);
    }

    /**
     * Gets a unique filename for a give text, by creating a MD5 hash of it. It
     * will be preceded by the locale.
     * <p>
     * Sample: "en-US_00a2653ac5f77063bc4ea2fee87318d3"
     */
    private String getUniqueFilenameForText(String text, String voiceName) {
        try {
            byte[] bytesOfMessage = (config.toConfigString() + text).getBytes(UTF_8);
            MessageDigest md = MessageDigest.getInstance("MD5");
            byte[] md5Hash = md.digest(bytesOfMessage);
            BigInteger bigInt = new BigInteger(1, md5Hash);
            StringBuilder hashText = new StringBuilder(bigInt.toString(16));
            // Now we need to zero pad it if you actually want the full 32 chars.
            while (hashText.length() < 32) {
                hashText.insert(0, "0");
            }
            return voiceName + "_" + hashText;
        } catch (UnsupportedEncodingException | NoSuchAlgorithmException ex) {
            // should not happen
            logger.error("Could not create MD5 hash for '{}'", text, ex);
            return null;
        }
    }

    boolean isInitialized() {
        return initialized;
    }

}