de.u808.simpleinquest.indexer.impl.DefaultIndexer.java Source code

Introduction

Here is the source code for de.u808.simpleinquest.indexer.impl.DefaultIndexer.java
Source

/*
 * Copyright 2008-2009 Andreas Friedel
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package de.u808.simpleinquest.indexer.impl;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;

import de.u808.common.MD5Util;
import de.u808.simpleinquest.indexer.Indexer;

/**
 * {@link Indexer} implementation that extracts the body text of a file with
 * Tika's {@link AutoDetectParser} and wraps it, together with the file's path,
 * an id derived from the path, and the last-modified time, in a Lucene
 * {@link Document}.
 */
public class DefaultIndexer implements Indexer {

    private static final Log log = LogFactory.getLog(DefaultIndexer.class);

    /**
     * Prefix of the cause message that identifies a {@link TikaException}
     * raised because the document is encrypted.
     */
    private static final String ENCRYPTED_MESSAGE_PREFIX = "Error decrypting document";

    /**
     * Builds a Lucene document for the given file.
     *
     * @param file the file to index
     * @return {@code null} if the file is not readable; a document without any
     *         fields if Tika reports the file as encrypted; otherwise a
     *         document carrying the path, id, modified and content fields
     * @throws FileNotFoundException declared by the signature; in this
     *         implementation a failure to open the file is caught together
     *         with every other exception and surfaces as an
     *         {@link IndexerException}
     * @throws IndexerException if opening, parsing or field construction fails
     *         for any reason other than an encrypted document; the original
     *         exception is preserved as the cause
     */
    public Document indexFile(File file) throws FileNotFoundException, IndexerException {
        if (!file.canRead()) {
            log.debug("Cant read file: " + file.getName());
            return null;
        }

        Document document = null;
        InputStream input = null;
        try {
            document = new Document();
            input = new FileInputStream(file);

            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            new AutoDetectParser().parse(input, handler, metadata);

            document.add(new Field(Indexer.PATH_FIELD_NAME, file.getPath(), Field.Store.YES,
                    Field.Index.UN_TOKENIZED));
            document.add(new Field(Indexer.ID_FIELD_NAME, MD5Util.getInstance().getMD5Hex(file.getPath()),
                    Field.Store.YES, Field.Index.UN_TOKENIZED));

            document.add(new Field(Indexer.MODIFIED_FIELD_NAME,
                    DateTools.timeToString(file.lastModified(), DateTools.Resolution.MINUTE), Field.Store.YES,
                    Field.Index.UN_TOKENIZED));

            document.add(new Field(Indexer.CONTENT_FIELD_NAME, new StringReader(handler.toString())));
        } catch (Exception e) {
            if (isEncryptedDocumentError(e)) {
                log.debug("Cant index encrypted document.");
                // NOTE(review): this hands back the still-empty Document rather
                // than null; kept as-is because callers may rely on it - confirm.
                return document;
            }
            throw new IndexerException(e.getMessage(), e);
        } finally {
            // The parser does not close the stream it was given, so release
            // the file handle here on both the success and the failure path.
            closeQuietly(input, file);
        }
        return document;
    }

    /**
     * Tells whether the exception is a {@link TikaException} whose cause
     * message starts with {@link #ENCRYPTED_MESSAGE_PREFIX}.
     */
    private static boolean isEncryptedDocumentError(Exception e) {
        if (!(e instanceof TikaException)) {
            return false;
        }
        Throwable cause = e.getCause();
        return cause != null
                && cause.getMessage() != null
                && cause.getMessage().startsWith(ENCRYPTED_MESSAGE_PREFIX);
    }

    /**
     * Closes the stream if it was opened. A failure to close is only logged so
     * that it can never mask the result or the exception of the indexing run.
     */
    private static void closeQuietly(InputStream input, File file) {
        if (input == null) {
            return;
        }
        try {
            input.close();
        } catch (IOException e) {
            log.warn("Could not close input stream for file: " + file.getPath(), e);
        }
    }

}