com.github.rnewson.couchdb.lucene.Tika.java Source code

Java tutorial

Introduction

Here is the source code for Tika.java, from the package com.github.rnewson.couchdb.lucene.

Source

package com.github.rnewson.couchdb.lucene;

/**
 * Copyright 2009 Robert Newson
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); 
 * you may not use this file except in compliance with the License. 
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0 
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static com.github.rnewson.couchdb.lucene.Utils.text;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;

import org.apache.commons.io.IOUtils;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
import org.apache.nutch.analysis.lang.LanguageIdentifier;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParsingReader;

/**
 * Runs an attachment stream through Tika's {@link AutoDetectParser} and adds
 * the extracted text, the Dublin Core metadata Tika reported and the
 * identified language to a Lucene {@link Document}.
 */
public final class Tika {

    private static final Logger log = LogManager.getLogger(Tika.class);

    /** Prefix put in front of every Dublin Core field name added to the document. */
    private static final String DC = "_dc.";

    /**
     * Extracts the text of {@code in} and adds it, plus related metadata
     * fields, to {@code doc}.
     *
     * <p>If reading the extracted text (or closing the reader) fails with an
     * {@link IOException}, a warning is logged and the method returns without
     * adding anything to {@code doc}.
     *
     * @param in
     *            stream holding the attachment content
     * @param contentType
     *            value stored under {@link Metadata#CONTENT_TYPE} before parsing
     * @param fieldName
     *            name of the document field that receives the extracted text
     * @param doc
     *            document that receives the new fields
     * @throws IOException
     *             if the {@link ParsingReader} cannot be created
     */
    public void parse(final InputStream in, final String contentType, final String fieldName, final Document doc)
            throws IOException {
        final AutoDetectParser detector = new AutoDetectParser();
        final Metadata metadata = new Metadata();
        metadata.set(Metadata.CONTENT_TYPE, contentType);

        // Constructed outside the try on purpose: a failure here is propagated
        // to the caller rather than logged and ignored.
        final Reader extracted = new ParsingReader(detector, in, metadata);

        final String body;
        try {
            body = readAndClose(extracted);
        } catch (final IOException e) {
            log.warn("Failed to index an attachment.", e);
            return;
        }

        // Body text first, then whatever Dublin Core metadata is present.
        doc.add(text(fieldName, body, false));
        addDublinCoreAttributes(metadata, doc);

        // Finally the language identified from the extracted text, if any.
        final String language = LanguageIdentifier.identifyLanguage(body);
        final boolean identified = language != null && language.length() > 0;
        if (identified) {
            doc.add(text(DC + DublinCore.LANGUAGE, language, false));
        }
    }

    /**
     * Reads {@code reader} to the end and closes it, whether or not the read
     * succeeded.
     *
     * @return everything read from {@code reader}
     * @throws IOException
     *             if reading or closing fails
     */
    private static String readAndClose(final Reader reader) throws IOException {
        try {
            return IOUtils.toString(reader);
        } finally {
            reader.close();
        }
    }

    /**
     * Copies each Dublin Core attribute that has a value in {@code metadata}
     * into {@code target}, in the fixed order listed below.
     */
    private void addDublinCoreAttributes(final Metadata metadata, final Document target) {
        final String[] names = {
            DublinCore.CONTRIBUTOR,
            DublinCore.COVERAGE,
            DublinCore.CREATOR,
            DublinCore.DATE,
            DublinCore.DESCRIPTION,
            DublinCore.FORMAT,
            DublinCore.IDENTIFIER,
            DublinCore.LANGUAGE,
            DublinCore.MODIFIED,
            DublinCore.PUBLISHER,
            DublinCore.RELATION,
            DublinCore.RIGHTS,
            DublinCore.SOURCE,
            DublinCore.SUBJECT,
            DublinCore.TITLE,
            DublinCore.TYPE,
        };
        for (int i = 0; i < names.length; i++) {
            addAttribute(DC, names[i], metadata, target);
        }
    }

    /**
     * Adds the field {@code prefix + name} to {@code target} when
     * {@code metadata} holds a non-null value for {@code name}; otherwise does
     * nothing.
     */
    private void addAttribute(final String prefix, final String name, final Metadata metadata,
            final Document target) {
        final String value = metadata.get(name);
        if (value == null) {
            return;
        }
        target.add(text(prefix + name, value, false));
    }
}