org.zanata.magpie.service.PersistentTranslationService.java Source code

Introduction

Here is the source code for org.zanata.magpie.service.PersistentTranslationService.java, the @ApplicationScoped CDI service in Zanata's Magpie machine translation project. It translates strings through pluggable TranslatorBackend implementations and persists the results as TextFlow / TextFlowTarget records so that translations with matching content hashes can be reused.

Source

/*
 * Copyright 2017, Red Hat, Inc. and individual contributors
 * as indicated by the @author tags. See the copyright.txt file in the
 * distribution for a full listing of individual contributors.
 *
 * This is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this software; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
 */
package org.zanata.magpie.service;

import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;

import javax.enterprise.context.ApplicationScoped;
import javax.enterprise.event.Event;
import javax.enterprise.inject.Instance;
import javax.inject.Inject;
import javax.validation.constraints.NotNull;
import javax.ws.rs.BadRequestException;

import org.apache.deltaspike.jpa.api.transaction.Transactional;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.zanata.magpie.api.AuthenticatedAccount;
import org.zanata.magpie.backend.BackendLocaleCode;
import org.zanata.magpie.dao.DocumentDAO;
import org.zanata.magpie.dao.TextFlowDAO;
import org.zanata.magpie.dao.TextFlowTargetDAO;
import org.zanata.magpie.event.RequestedMTEvent;
import org.zanata.magpie.exception.MTException;
import org.zanata.magpie.model.AugmentedTranslation;
import org.zanata.magpie.model.BackendID;
import org.zanata.magpie.model.Document;
import org.zanata.magpie.model.Locale;
import org.zanata.magpie.model.StringType;
import org.zanata.magpie.model.TextFlow;
import org.zanata.magpie.model.TextFlowTarget;
import org.zanata.magpie.util.HashUtil;
import org.zanata.magpie.util.ShortString;

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;

import static org.apache.commons.lang3.exception.ExceptionUtils.getThrowableList;

/**
 * @author Alex Eng <a href="mailto:aeng@redhat.com">aeng@redhat.com</a>
 * @author Patrick Huang <a href="mailto:pahuang@redhat.com">pahuang@redhat.com</a>
 */
@ApplicationScoped
public class PersistentTranslationService {
    private static final Logger LOG = LoggerFactory.getLogger(PersistentTranslationService.class);

    private DocumentDAO documentDAO;
    private TextFlowDAO textFlowDAO;
    private TextFlowTargetDAO textFlowTargetDAO;
    private Event<RequestedMTEvent> requestedMTEvent;
    private AuthenticatedAccount authenticatedAccount;

    private Map<BackendID, TranslatorBackend> translatorBackendMap;

    @SuppressWarnings("unused")
    public PersistentTranslationService() {
    }

    // Note that @Dependent Instances have to be destroyed[1], but only the
    // @Dependent ones[2], and yet there is no easy way of identifying them
    // (short of using WeldInstance[3]). So make sure TranslatorBackends are
    // not @Dependent scoped!
    // [1] https://blog.akquinet.de/2017/01/04/dont-get-trapped-into-a-memory-leak-using-cdi-instance-injection/
    // [2] https://developer.jboss.org/thread/252712
    // [3] http://weld.cdi-spec.org/news/2016/05/18/enhanced-instance/
    @Inject
    public PersistentTranslationService(DocumentDAO documentDAO, TextFlowDAO textFlowDAO,
            TextFlowTargetDAO textFlowTargetDAO, Instance<TranslatorBackend> translatorBackends,
            Event<RequestedMTEvent> requestedMTEvent, AuthenticatedAccount authenticatedAccount) {
        this.documentDAO = documentDAO;
        this.textFlowDAO = textFlowDAO;
        this.textFlowTargetDAO = textFlowTargetDAO;
        this.requestedMTEvent = requestedMTEvent;
        this.authenticatedAccount = authenticatedAccount;

        Map<BackendID, TranslatorBackend> backendMap = new HashMap<>();
        for (TranslatorBackend backend : translatorBackends) {
            backendMap.put(backend.getId(), backend);
        }

        translatorBackendMap = Collections.unmodifiableMap(backendMap);
    }

    /**
     * Translate multiple strings in a single API request.
     *
     * For each string, look up a text flow with a matching content hash in
     * the same document; failing that, reuse the latest TF from the DB with a
     * matching hash; if no stored translation is found, request one from the
     * MT engine.
     */
    @Transactional
    public List<String> translate(@NotNull Document document, @NotNull List<String> sourceStrings,
            @NotNull Locale fromLocale, @NotNull Locale toLocale, @NotNull BackendID backendID,
            @NotNull StringType stringType, Optional<String> category) throws BadRequestException, MTException {
        // fetch the text flows for later (as part of this new transaction)
        document = documentDAO.reload(document);
        document.getTextFlows();
        if (sourceStrings == null || sourceStrings.isEmpty() || fromLocale == null || toLocale == null
                || backendID == null) {
            throw new BadRequestException();
        }
        if (!authenticatedAccount.hasAuthenticatedAccount()) {
            throw new MTException("not authenticated account trying to trigger MT translation");
        }

        // get translator backend for MT engine by requested backend id
        TranslatorBackend translatorBackend = getTranslatorBackend(backendID);

        BackendLocaleCode mappedFromLocaleCode = translatorBackend.getMappedLocale(fromLocale.getLocaleCode());
        BackendLocaleCode mappedToLocaleCode = translatorBackend.getMappedLocale(toLocale.getLocaleCode());

        List<String> results = new ArrayList<>(sourceStrings);
        Multimap<String, Integer> untranslatedIndexMap = ArrayListMultimap.create();

        Map<Integer, TextFlow> indexTextFlowMap = Maps.newHashMap();

        // search from database
        int matchCount = 0;
        for (int sourceStringIndex = 0; sourceStringIndex < sourceStrings.size(); sourceStringIndex++) {
            String string = sourceStrings.get(sourceStringIndex);
            String contentHash = HashUtil.generateHash(string);
            Optional<TextFlow> matchedHashTf = tryFindTextFlowByContentHashFromDB(document, fromLocale, toLocale,
                    backendID, contentHash);

            if (matchedHashTf.isPresent()) {
                // we found a matching text flow in the database;
                // now check whether it has a translation from the same provider
                TextFlow matchedTf = matchedHashTf.get();
                Optional<TextFlowTarget> matchedTarget = findTargetByLocaleAndProvider(toLocale, backendID,
                        matchedTf);

                if (matchedTarget.isPresent()) {
                    TextFlowTarget matchedEntity = matchedTarget.get();
                    matchCount++;
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Found match, Source {}:{}:{}\nTranslation {}:{}", fromLocale.getLocaleCode(),
                                document.getUrl(), ShortString.shorten(string), toLocale.getLocaleCode(),
                                ShortString.shorten(matchedEntity.getContent()));
                    }

                    results.set(sourceStringIndex, matchedEntity.getContent());
                } else {
                    untranslatedIndexMap.put(string, sourceStringIndex);
                    indexTextFlowMap.put(sourceStringIndex, matchedTf);
                }
            } else {
                untranslatedIndexMap.put(string, sourceStringIndex);
            }
        }
        LOG.info("found {} of match sources and translations in database", matchCount);

        // see if we got all translations from database records
        if (untranslatedIndexMap.isEmpty()) {
            return results;
        }

        // translate using requested MT engine
        List<String> sourcesToTranslate = new ArrayList<>(untranslatedIndexMap.keySet());
        Date engineInvokeTime = new Date();
        List<AugmentedTranslation> translations = translatorBackend.translate(sourcesToTranslate,
                mappedFromLocaleCode, mappedToLocaleCode, stringType, category);

        LOG.info("triggered MT engine {} from {} to {}", backendID, fromLocale.getLocaleCode(),
                toLocale.getLocaleCode());

        List<String> requestedTextFlows = Lists.newLinkedList();
        long wordCount = 0;
        long charCount = 0;
        for (int i = 0; i < sourcesToTranslate.size(); i++) {
            String source = sourcesToTranslate.get(i);
            AugmentedTranslation translation = translations.get(i);
            // the same string may appear several times in a document and therefore map to several indexes
            Collection<Integer> indexes = untranslatedIndexMap.get(source);
            indexes.forEach(j -> results.set(j, translation.getPlainTranslation()));

            // see if we already have a matched text flow
            // (either in the same document or copied from other document)
            TextFlow tf = indexTextFlowMap.get(indexes.iterator().next());

            try {
                if (tf == null) {
                    tf = createTextFlow(document, source, fromLocale);
                }
                wordCount += tf.getWordCount();
                charCount += tf.getCharCount();
                requestedTextFlows.add(tf.getContentHash());
                TextFlowTarget target = new TextFlowTarget(translation.getPlainTranslation(),
                        translation.getRawTranslation(), tf, toLocale, backendID);
                createOrUpdateTextFlowTarget(target);
            } catch (Exception e) {
                List<Throwable> causalChain = getThrowableList(e);
                Optional<Throwable> duplicateKeyEx = causalChain.stream()
                        .filter(t -> t instanceof SQLException && t.getMessage() != null
                                && t.getMessage().contains("duplicate key value violates unique constraint"))
                        .findAny();
                if (duplicateKeyEx.isPresent()) {
                    LOG.warn("concurrent requests for document {}", document.getUrl());
                    // we ignore the failed update
                    // TODO prevent duplicates from reaching DB: ZNTAMT-51
                }
            }
        }
        requestedMTEvent.fire(new RequestedMTEvent(document, requestedTextFlows, backendID, engineInvokeTime,
                authenticatedAccount.getAuthenticatedAccount().get(), wordCount, charCount));

        return results;
    }

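    /**
     * Look up the translator backend registered for the given backend ID.
     *
     * @throws BadRequestException if no backend supports the requested ID
     */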
    private @NotNull TranslatorBackend getTranslatorBackend(@NotNull BackendID backendID) {
        if (translatorBackendMap.containsKey(backendID)) {
            return translatorBackendMap.get(backendID);
        }
        throw new BadRequestException("Unsupported backendId: " + backendID);
    }

    /**
     * See if the document already has the text flow in the database. If not,
     * search for the same content hash in other documents. If one is found,
     * copy the text flow and its text flow target. If none is found, return
     * empty.
     *
     * @param document
     *            current document
     * @param fromLocale
     *            from locale
     * @param toLocale
     *            to locale
     * @param backendID
     *            translation provider
     * @param contentHash
     *            text flow content hash
     * @return optional text flow that has the matching content hash
     */
    private Optional<TextFlow> tryFindTextFlowByContentHashFromDB(@NotNull Document document,
            @NotNull Locale fromLocale, @NotNull Locale toLocale, @NotNull BackendID backendID,
            String contentHash) {
        TextFlow matchedHashTf = document.getTextFlows().get(contentHash);
        if (matchedHashTf == null) {
            // this document doesn't have the text flow yet,
            // so try to find a text flow with the same content hash in the database
            Optional<TextFlow> tfCopy = tryCopyTextFlowAndTargetFromDB(document, fromLocale, toLocale, contentHash,
                    backendID);

            matchedHashTf = tfCopy.orElse(null);
        }
        return Optional.ofNullable(matchedHashTf);
    }

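    /**
     * @return the maximum number of characters the given backend accepts in a
     *         single translation request
     */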
    public int getMaxLength(@NotNull BackendID backendID) {
        return getTranslatorBackend(backendID).getCharLimitPerRequest();
    }

    /**
     * Find a text flow with a matching contentHash. If it belongs to a
     * different document, create a new copy of the TextFlow and its
     * TextFlowTarget; otherwise return the existing one. If there is no
     * matching contentHash, return empty.
     *
     * TODO: refactor TextFlow to use pos to allow duplication of content
     */
    private Optional<TextFlow> tryCopyTextFlowAndTargetFromDB(Document document, Locale fromLocale, Locale toLocale,
            String contentHash, BackendID backendID) {
        Optional<TextFlow> textFlow = textFlowDAO.getLatestByContentHash(fromLocale.getLocaleCode(), contentHash);
        if (textFlow.isPresent()) {
            if (textFlow.get().getDocument().equals(document)) {
                // this document already has this text flow
                return textFlow;
            } else {
                // found a matching text flow in a different document;
                // copy the textFlow and, if present, its textFlowTarget
                TextFlow newTfCopy = new TextFlow(document, textFlow.get().getContent(), fromLocale);
                Optional<TextFlowTarget> matchedTft = findTargetByLocaleAndProvider(toLocale, backendID,
                        textFlow.get());
                if (matchedTft.isPresent()) {
                    TextFlowTarget tft = matchedTft.get();
                    newTfCopy.getTargets().add(new TextFlowTarget(tft.getContent(), tft.getRawContent(), newTfCopy,
                            toLocale, backendID));
                }
                newTfCopy = textFlowDAO.persist(newTfCopy);
                document.getTextFlows().put(newTfCopy.getContentHash(), newTfCopy);
                return Optional.of(newTfCopy);
            }
        }
        return Optional.empty();
    }

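    /**
     * Persist a new text flow for the given document, source content and
     * source locale.
     */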
    private TextFlow createTextFlow(Document document, String source, Locale locale) {
        TextFlow tf = new TextFlow(document, source, locale);
        tf = textFlowDAO.persist(tf);
        return tf;
    }

    /**
     * If a matching TextFlowTarget (locale + backendId) is found, update its
     * content and rawContent; otherwise persist a new TextFlowTarget.
     */
    private void createOrUpdateTextFlowTarget(TextFlowTarget tft) {
        TextFlow tf = tft.getTextFlow();
        Optional<TextFlowTarget> existingTarget = textFlowTargetDAO.findTarget(tf, tft.getLocale(),
                tft.getBackendId());
        if (existingTarget.isPresent()) {
            existingTarget.get().updateContent(tft.getContent(), tft.getRawContent());
        } else {
            textFlowTargetDAO.persist(tft);
        }
    }

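    /**
     * Find an existing translation of the text flow for the given locale and
     * MT provider, if any.
     */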
    private static Optional<TextFlowTarget> findTargetByLocaleAndProvider(Locale toLocale, BackendID backendID,
            TextFlow textFlow) {
        return textFlow.getTargets().stream()
                .filter(target -> target.getLocale().equals(toLocale) && target.getBackendId().equals(backendID))
                .findAny();
    }
}
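
Example usage

The listing above is the complete service. As a rough sketch of how a caller might use it, the hypothetical CDI bean below (TranslationCaller is not part of the project) injects PersistentTranslationService and delegates to translate(); the Document, Locale, BackendID and StringType values are assumed to be resolved elsewhere, for example by the REST layer handling the request.

package org.zanata.magpie.example;

import java.util.List;
import java.util.Optional;

import javax.enterprise.context.ApplicationScoped;
import javax.inject.Inject;
import javax.ws.rs.BadRequestException;

import org.zanata.magpie.exception.MTException;
import org.zanata.magpie.model.BackendID;
import org.zanata.magpie.model.Document;
import org.zanata.magpie.model.Locale;
import org.zanata.magpie.model.StringType;
import org.zanata.magpie.service.PersistentTranslationService;

@ApplicationScoped
public class TranslationCaller {

    @Inject
    private PersistentTranslationService persistentTranslationService;

    /**
     * Translate the given strings, reusing any translations already stored
     * for the document and falling back to the requested MT backend for the
     * rest. An empty category is passed for simplicity.
     */
    public List<String> translate(Document document, List<String> sources,
            Locale fromLocale, Locale toLocale, BackendID backendID,
            StringType stringType) throws BadRequestException, MTException {
        // the service itself validates the request and checks authentication
        return persistentTranslationService.translate(document, sources,
                fromLocale, toLocale, backendID, stringType, Optional.empty());
    }
}

A caller could also consult getMaxLength(backendID) beforehand to keep each request within the backend's per-request character limit.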