com.github.msarhan.lucene.ArabicRootExtractorAnalyzerTests.java Source code

Introduction

Here is the source code for com.github.msarhan.lucene.ArabicRootExtractorAnalyzerTests.java
Source

/*
 * The MIT License
 *
 * Copyright 2015 Mouaffak A. Sarhan
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package com.github.msarhan.lucene;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.junit.Test;

import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.file.Files;
import java.util.concurrent.atomic.AtomicInteger;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;

/**
 * @author Mouaffak A. Sarhan &lt;mouffaksarhan@gmail.com&gt;
 */
public class ArabicRootExtractorAnalyzerTests {

    @Test
    public void testArabicRootIndex() throws IOException, ParseException, URISyntaxException {
        Directory index = new RAMDirectory();
        ArabicRootExtractorAnalyzer analyzer = new ArabicRootExtractorAnalyzer();
        IndexWriterConfig config = new IndexWriterConfig(analyzer);

        final AtomicInteger id = new AtomicInteger(0);
        IndexWriter w = new IndexWriter(index, config);
        URL url = ArabicRootExtractorStemmer.class.getClassLoader()
                .getResource("com/github/msarhan/lucene/fateha.txt");

        if (url == null) {
            fail("Not able to load data file!");
        }

        Files.lines(new File(url.toURI()).toPath())
                .forEach(line -> addDoc(w, line, String.valueOf(id.incrementAndGet())));
        w.close();

        String querystr = "";
        Query q = new QueryParser("title", analyzer).parse(querystr);

        int hitsPerPage = 10;
        IndexReader reader = DirectoryReader.open(index);
        IndexSearcher searcher = new IndexSearcher(reader);
        TopDocs docs = searcher.search(q, hitsPerPage);

        //print(searcher, docs);

        assertEquals(2, docs.scoreDocs.length);
    }

    private void addDoc(IndexWriter w, String title, String number) {
        Document doc = new Document();
        doc.add(new TextField("title", title, Field.Store.YES));
        doc.add(new StringField("number", number, Field.Store.YES));
        try {
            w.addDocument(doc);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    private void print(IndexSearcher searcher, TopDocs docs) throws IOException {
        ScoreDoc[] hits = docs.scoreDocs;

        System.out.println("Found " + hits.length + " hits.");
        for (ScoreDoc hit : hits) {
            int docId = hit.doc;
            Document d = searcher.doc(docId);
            System.out.println(d.get("number") + "\t" + d.get("title"));
        }
    }

    @Test
    public void testInlineStemmer() throws IOException, ParseException {

        //Initialize the index
        Directory index = new RAMDirectory();
        Analyzer analyzer = new ArabicRootExtractorAnalyzer();
        IndexWriterConfig config = new IndexWriterConfig(analyzer);
        IndexWriter writer = new IndexWriter(index, config);

        Document doc = new Document();
        doc.add(new StringField("number", "1", Field.Store.YES));
        doc.add(new TextField("title", "?? ? ? ??",
                Field.Store.YES));
        writer.addDocument(doc);

        doc = new Document();
        doc.add(new StringField("number", "2", Field.Store.YES));
        doc.add(new TextField("title", "? ?? ? ?",
                Field.Store.YES));
        writer.addDocument(doc);

        doc = new Document();
        doc.add(new StringField("number", "3", Field.Store.YES));
        doc.add(new TextField("title", "? ??", Field.Store.YES));
        writer.addDocument(doc);
        writer.close();
        //~

        //Query the index
        String queryStr = "";
        Query query = new QueryParser("title", analyzer).parse(queryStr);

        int hitsPerPage = 5;
        IndexReader reader = DirectoryReader.open(index);
        IndexSearcher searcher = new IndexSearcher(reader);
        TopDocs docs = searcher.search(query, hitsPerPage, Sort.INDEXORDER);

        ScoreDoc[] hits = docs.scoreDocs;
        //~

        //Print results
        /*
        System.out.println("Found " + hits.length + " hits:");
        for (ScoreDoc hit : hits) {
           int docId = hit.doc;
           Document d = searcher.doc(docId);
           System.out.printf("\t(%s): %s\n", d.get("number"), d.get("title"));
        }
        */
        //~

    }

}