org.talend.dataquality.standardization.index.test.MyTest.java Source code

Java tutorial

Introduction

Here is the source code for org.talend.dataquality.standardization.index.test.MyTest.java

Source

// ============================================================================
//
// Copyright (C) 2006-2015 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dataquality.standardization.index.test;

import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

import junit.framework.TestCase;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocsCollector;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

/**
 * DOC scorreia class global comment. Detailled comment
 */
public class MyTest extends TestCase {

    private static final String[] input = { "WING ASSEMBY, USE 5J868-A HEX BOLT .25? - DRILL FOUR HOLES",
            "WING ASSY DRILL 4 HOLE USE 5J868A HEXBOLT 1/4 INCH",
            "WING ASSY DRILL 4 HOLE USE 5J868A HEXBOLT 1/4 INCH",
            "WING ASSY DRILL 4 HOLE USE 5J868A HEXBOLT 1/4 INCH",
            "WING ASSY DRILL 4 HOLE USE 5J868A HEXBOLT 1/4 INCH",
            "WING ASSY DRILL 4 HOLE USE 5J868A HEXBOLT 1/4 INCH",
            "WING ASSY DRILL 4 HOLE USE 5J868A HEXBOLT 1/4 INCH",
            "WING ASSY DRILL 4 HOLE USE 5J868A HEXBOLT 1/4 INCH",
            "WING ASSY DRILL 4 HOLE USE 5J868A HEXBOLT 1/4 INCH",
            "WING ASSY DRILL 4 HOLE USE 5J868A HEXBOLT 1/4 INCH",
            "WING ASSY DRILL 4 HOLE USE 5J868A HEXBOLT 1/4 INCH",
            "WING ASSY DRILL 4 HOLE USE 5J868A HEXBOLT 1/4 INCH",
            "WING ASSY DRILL 4 HOLE USE 5J868A HEXBOLT 1/4 INCH",
            "WING ASSEMBY, USE 5J868-A HEX BOLT .25? - DRILL FOUR HOLES",
            "USE 4 5J868A BOLTS (HEX .25) - DRILL HOLES FOR EA ON WING ASSEM" };

    public void testRun() {
        Directory index;
        try {
            index = new RAMDirectory();
            // The same analyzer should be used for indexing and searching
            Analyzer analyzer = SynonymTest.createAnalyzer();
            // Analyzer analyzer = new StandardAnalyzer();
            // the boolean arg in the IndexWriter ctor means to
            // create a new index, overwriting any existing index
            IndexWriter w = new IndexWriter(index, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
            // read the data (this will be the input data of a component called
            // tFirstNameStandardize)

            for (String data : input) {
                Document doc = createDoc(data);
                w.addDocument(doc);
            }

            w.close();

            // now search
            IndexSearcher is = new IndexSearcher(index);
            Set<String> set = new HashSet<String>(Arrays.asList(input));

            for (String data : set) {

                // Term termName = new Term("steph");
                QueryParser qp = new QueryParser(Version.LUCENE_30, "FIELD_NAME", analyzer);
                Query q = qp.parse(data);

                TopDocsCollector<?> collector = TopScoreDocCollector.create(2, false);
                is.search(q, collector);

                ScoreDoc[] scoreDocs = collector.topDocs().scoreDocs;
                System.out.println("\nnb doc= " + scoreDocs.length);
                System.out.print("input: " + data);
                for (ScoreDoc scoreDoc : scoreDocs) {
                    String matchedData = is.doc(scoreDoc.doc).get("FIELD_NAME");
                    System.out.println("  matches  " + matchedData);
                }

            }
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (ParseException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }

    }

    /**
     * DOC scorreia Comment method "createDoc".
     * 
     * @param data
     * @return
     */
    private Document createDoc(String data) {
        Document doc = new Document();
        Field field = new Field("FIELD_NAME", data, Field.Store.YES, Field.Index.ANALYZED, TermVector.YES);
        doc.add(field);
        return doc;
    }

}