edu.cuhk.hccl.Indexer.java Source code

Java tutorial

Introduction

Here is the source code for edu.cuhk.hccl.Indexer.java

Source

/**
 * Copyright (C) 2014 Pengfei Liu <pfliu@se.cuhk.edu.hk>
 * The Chinese University of Hong Kong.
 *
 * This file is part of smart-search-web.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.cuhk.hccl;

import java.io.File;
import java.io.IOException;
import java.util.Collection;

import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

/**
 * Builds a Lucene index from a folder of text files.
 *
 * <p>All state is static: documents are written into the shared {@link #index}
 * directory using the shared {@link #analyzer}.
 */
public class Indexer {

    /** Name of the field that holds the full text of each indexed file. */
    public final static String CONTENT_FIELD = "content";

    /** Name of the field that holds the path of each indexed file. */
    public final static String PATH_FIELD = "path";

    /**
     * Field type used for {@link #CONTENT_FIELD}: indexed, tokenized, stored,
     * with term vectors and term-vector positions. Frozen in the static
     * initializer below, so it cannot be modified afterwards.
     */
    public static final FieldType TERM_STORED = new FieldType();

    /** Analyzer handed to the {@link IndexWriterConfig} in {@link #createIndex(String)}. */
    public static StandardAnalyzer analyzer = new StandardAnalyzer();

    /** Directory that {@link #createIndex(String)} writes into. */
    public static Directory index = new RAMDirectory();

    static {
        TERM_STORED.setIndexed(true);
        TERM_STORED.setTokenized(true);
        TERM_STORED.setStored(true);
        TERM_STORED.setStoreTermVectors(true);
        TERM_STORED.setStoreTermVectorPositions(true);
        TERM_STORED.freeze();
    }

    /**
     * Indexes every file found (recursively) under a data folder into {@link #index}.
     *
     * <p>Each file becomes one document with two fields: {@link #PATH_FIELD}
     * (the file path, stored as a single un-tokenized term) and
     * {@link #CONTENT_FIELD} (the file text, using {@link #TERM_STORED}).
     * Progress is printed to standard output.
     *
     * <p>The writer is always closed, even when reading or indexing a file
     * fails, so a failed run does not leave an open writer on the shared
     * directory. NOTE(review): whether documents added before the failure end
     * up committed depends on the {@code IndexWriter.close()} semantics of the
     * Lucene version in use - confirm if partial indexes matter.
     *
     * @param dataSet path of the folder whose files are indexed
     * @throws IOException if a file cannot be read or the index cannot be written
     */
    public static void createIndex(String dataSet) throws IOException {
        IndexWriterConfig config = new IndexWriterConfig(Version.LATEST, analyzer);

        // null extensions = accept every file; true = descend into sub-folders.
        Collection<File> files = FileUtils.listFiles(new File(dataSet), null, true);

        // try-with-resources guarantees the writer is closed on every exit path.
        try (IndexWriter writer = new IndexWriter(index, config)) {
            for (File file : files) {
                String path = file.getPath();
                // NOTE(review): this overload takes no charset argument, so the
                // text is presumably decoded with the platform default - confirm
                // the data set's encoding matches.
                String content = FileUtils.readFileToString(file);

                Document doc = new Document();
                doc.add(new StringField(PATH_FIELD, path, Field.Store.YES));
                doc.add(new Field(CONTENT_FIELD, content, TERM_STORED));

                writer.addDocument(doc);

                System.out.println("[INFO] Indexing file: " + path);
            }

            System.out.println("\n[INFO]" + files.size() + " files has been indexed.");
        }
    }

}