com.qwazr.extractor.ParserAbstract.java Source code

Java tutorial

Introduction

Here is the source code for com.qwazr.extractor.ParserAbstract.java

Source

/**
 * Copyright 2015-2017 Emmanuel Keller / QWAZR
 * <p/>
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * <p/>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p/>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.qwazr.extractor;

import com.qwazr.utils.Language;
import com.qwazr.utils.StringUtils;
import org.apache.commons.io.IOUtils;

import javax.ws.rs.core.MultivaluedMap;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.Locale;
import java.util.function.Function;

/**
 * Base class for parsers: provides the parser name plus helpers for MIME type
 * resolution, request parameter lookup, temporary file creation and language
 * detection.
 */
public abstract class ParserAbstract implements ParserInterface {

    /**
     * Lower-cased simple class name with a trailing "Parser" removed.
     */
    private final String name;

    protected ParserAbstract() {
        // Locale.ROOT keeps the name stable whatever the JVM default locale is
        // (a Turkish default locale would otherwise turn 'I' into a dotless 'ı').
        name = StringUtils.removeEnd(this.getClass().getSimpleName(), "Parser").toLowerCase(Locale.ROOT);
    }

    /**
     * Resolve a MIME type, preferring the one explicitly provided.
     *
     * @param extension           the file extension, may be null
     * @param mimeType            an already known MIME type, may be null
     * @param extensionToMimeType the function applied to the extension when no MIME type is provided
     * @return the given MIME type if not null, else the result of the function applied to the extension,
     * or null if both the MIME type and the extension are null
     */
    protected String findMimeType(final String extension, final String mimeType,
            final Function<String, String> extensionToMimeType) {
        if (mimeType != null) {
            return mimeType;
        }
        if (extension == null) {
            return null;
        }
        return extensionToMimeType.apply(extension);
    }

    /**
     * Look up the MIME type located at the same index as the given extension in
     * the default extensions / default MIME types arrays.
     *
     * @param extension the extension to look for, may be null
     * @return the matching MIME type, or null if the extension is null or unknown, or if the two arrays
     * are missing or do not have the same length
     */
    protected String findMimeTypeUsingDefault(String extension) {
        final String[] extensions = getDefaultExtensions();
        final String[] mimeTypes = getDefaultMimeTypes();
        // Both arrays are required, and they must be parallel to be usable as a mapping.
        if (extension == null || extensions == null || mimeTypes == null
                || extensions.length != mimeTypes.length) {
            return null;
        }
        for (int i = 0; i < extensions.length; i++) {
            if (extension.equals(extensions[i])) {
                return mimeTypes[i];
            }
        }
        return null;
    }

    /**
     * Read one value of a multi-valued parameter.
     *
     * @param parameters the parameters, may be null
     * @param param      the field whose name is used as parameter key
     * @param position   the zero-based index of the wanted value
     * @return the value at the given position, or null if the parameters are null, the key is absent,
     * or the position is out of range
     */
    protected String getParameterValue(final MultivaluedMap<String, String> parameters, final ParserField param,
            final int position) {
        if (parameters == null) {
            return null;
        }
        final List<String> values = parameters.get(param.name);
        if (values == null) {
            return null;
        }
        // A negative position is treated like any other out-of-range position.
        if (position < 0 || position >= values.size()) {
            return null;
        }
        return values.get(position);
    }

    @Override
    public final String getName() {
        return name;
    }

    /**
     * Copy the content of a stream into a newly created temporary file. The
     * input stream is not closed by this method.
     *
     * @param inputStream the stream to read
     * @param extension   the suffix passed to {@link Files#createTempFile(String, String, java.nio.file.attribute.FileAttribute[])}
     * @return the path of the temporary file
     * @throws IOException if the file cannot be created or written; in the latter case the temporary
     *                     file is deleted before the exception is propagated
     */
    protected static Path createTempFile(final InputStream inputStream, final String extension) throws IOException {
        final Path tempFile = Files.createTempFile("oss-extractor", extension);
        try (final OutputStream out = Files.newOutputStream(tempFile);
                final BufferedOutputStream bOut = new BufferedOutputStream(out)) {
            // try-with-resources flushes and closes the streams, no explicit close is needed
            IOUtils.copy(inputStream, bOut);
        } catch (IOException | RuntimeException e) {
            // Do not leave a partially written file behind
            try {
                Files.deleteIfExists(tempFile);
            } catch (IOException deleteFailure) {
                e.addSuppressed(deleteFailure);
            }
            throw e;
        }
        return tempFile;
    }

    /**
     * Append the value(s) of a field to a buffer. Nothing is appended if the
     * buffer already contains maxLength characters or more. For a list value,
     * each non null item is appended followed by a space, until maxLength is
     * reached. Any other value is appended as a whole.
     *
     * @param document  the document holding the fields, may be null
     * @param source    the field to read
     * @param maxLength the number of characters from which the extraction stops
     * @param sb        the buffer receiving the text
     */
    protected void extractField(final ParserFieldsBuilder document, final ParserField source, final int maxLength,
            final StringBuilder sb) {
        if (sb.length() >= maxLength) {
            return;
        }
        if (document == null || document.fields == null) {
            return;
        }
        final Object value = document.fields.get(source.name);
        if (value == null) {
            return;
        }
        if (value instanceof List) {
            for (final Object object : (List<?>) value) {
                if (object == null) {
                    continue;
                }
                ensureSeparator(sb);
                sb.append(object.toString());
                sb.append(' ');
                if (sb.length() >= maxLength) {
                    return;
                }
            }
        } else {
            ensureSeparator(sb);
            sb.append(value.toString());
        }
    }

    /**
     * Append a space if the buffer already holds text which does not end with a
     * whitespace, so that values extracted from successive documents are not
     * glued together into a single word.
     */
    private static void ensureSeparator(final StringBuilder sb) {
        final int length = sb.length();
        if (length > 0 && !Character.isWhitespace(sb.charAt(length - 1))) {
            sb.append(' ');
        }
    }

    /**
     * Submit the content of a field to language detection. The field is
     * collected from every document of the result.
     *
     * @param resultBuilder the documents to check
     * @param source        the field to submit
     * @param maxLength     the maximum number of characters
     * @return the detected language
     */
    protected final String languageDetection(final ParserResultBuilder resultBuilder, final ParserField source,
            final int maxLength) {
        final StringBuilder sb = new StringBuilder();
        resultBuilder.documentsBuilders.forEach(doc -> extractField(doc, source, maxLength, sb));
        return Language.quietDetect(sb.toString(), maxLength);
    }

    /**
     * Submit the content of a field to language detection.
     *
     * @param document  the document to check
     * @param source    the field containing the text to match
     * @param maxLength the maximum number of characters to test
     * @return the detected language
     */
    protected final String languageDetection(final ParserFieldsBuilder document, final ParserField source,
            final int maxLength) {
        final StringBuilder sb = new StringBuilder();
        extractField(document, source, maxLength, sb);
        return Language.quietDetect(sb.toString(), maxLength);
    }

}