org.nlp4l.lucene.TermsExtractor.java Source code

Introduction

Here is the source code for org.nlp4l.lucene.TermsExtractor.java
Source

/*
 * Copyright 2016 org.NLP4L
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.nlp4l.lucene;

import java.io.IOException;
import java.io.PrintWriter;
import java.lang.reflect.Constructor;
import java.nio.file.FileSystems;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

/**
 * Extracts the best-scoring terms of one Lucene field and prints them.
 * <p>
 * {@link #execute()} opens the index named by {@link Config#index}, enumerates every term of the
 * field named by {@link Config#fieldCn}, scores each term with the configured
 * {@link CompoundNounScorer} and prints at most {@link Config#outNum} of them.
 * <p>
 * The table lists the property names documented for this extractor. This class itself does not
 * read properties; it is driven by a {@link Config} object whose fields presumably mirror them.
 * <table border="1" cellpadding="3" cellspacing="0">
 * <tr class="TableHeadingColor">
 * <th>Property</th><th>Description</th><th>Default</th>
 * </tr>
 * <tr>
 * <td>nlp4l.terms.extractor.lucene.index</td><td>Lucene index directory</td><td></td>
 * </tr>
 * <tr>
 * <td>nlp4l.terms.extractor.lucene.field.cn</td><td>Lucene field whose terms are scored (compound nouns)</td><td></td>
 * </tr>
 * <tr>
 * <td>nlp4l.terms.extractor.lucene.field.ln2</td><td>Lucene field handed to the scorer as <code>fieldNameLn2</code></td><td>${nlp4l.terms.extractor.lucene.field.cn}_ln2</td>
 * </tr>
 * <tr>
 * <td>nlp4l.terms.extractor.lucene.field.rn2</td><td>Lucene field handed to the scorer as <code>fieldNameRn2</code></td><td>${nlp4l.terms.extractor.lucene.field.cn}_rn2</td>
 * </tr>
 * <tr>
 * <td>nlp4l.terms.extractor.lucene.delimiter</td><td>Delimiter handed to the scorer and removed from printed terms</td><td>"/"</td>
 * </tr>
 * <tr>
 * <td>nlp4l.terms.extractor.out.file</td><td>Output file (standard output when unset)</td><td></td>
 * </tr>
 * <tr>
 * <td>nlp4l.terms.extractor.out.score</td><td>Whether the score is printed next to each term</td><td>true</td>
 * </tr>
 * <tr>
 * <td>nlp4l.terms.extractor.out.num</td><td>Maximum number of terms to print</td><td>1000</td>
 * </tr>
 * <tr>
 * <td>nlp4l.terms.extractor.verbose</td><td>verbose</td><td>false</td>
 * </tr>
 * </table>
 *
 * @since 0.3
 */
public class TermsExtractor {

    /** Delimiter used when {@link Config#delimiter} is not set. */
    public static final String DEF_DELIMITER = "/";

    /** Initial value of {@link Config#outNum}. */
    public static final int DEF_OUT_NUM = 1000;

    /**
     * Settings of one extraction run. The public fields are filled in by the caller; the
     * package-private getters validate them and apply the defaults.
     */
    public static class Config {
        /** Short scorer name: {@code FreqDFLR}, {@code FreqLR}, {@code TypeCountDFLR} or {@code TypeCountLR}. */
        public String scorer;
        /** Path of the Lucene index directory. */
        public String index;
        /** Name of the field whose terms are enumerated and scored. */
        public String fieldCn;
        /** Name of the field handed to the scorer as {@code fieldNameLn2}; defaults to {@code fieldCn + "_ln2"}. */
        public String fieldLn2;
        /** Name of the field handed to the scorer as {@code fieldNameRn2}; defaults to {@code fieldCn + "_rn2"}. */
        public String fieldRn2;
        /** Delimiter handed to the scorer and removed from printed terms; defaults to {@link #DEF_DELIMITER}. */
        public String delimiter;
        /** Output file name, or {@code null} to write to standard output. */
        public String outFile;
        /** Whether the score is printed after each term. */
        public boolean outScore = true;
        /** Capacity of the result queue, i.e. the maximum number of printed terms. */
        public int outNum = DEF_OUT_NUM;

        /**
         * Maps the short scorer name to the fully qualified scorer class name.
         *
         * @return the class name to load reflectively
         * @throws IllegalArgumentException if {@link #scorer} is {@code null} or not a known name
         */
        String getScorer() {
            if (scorer == null) {
                throw new IllegalArgumentException("scorer must be specified");
            }
            switch (scorer) {
            case "FreqDFLR":
                return "org.nlp4l.lucene.ConcatFreqDFLRCompoundNounScorer";
            case "FreqLR":
                return "org.nlp4l.lucene.ConcatFreqLRCompoundNounScorer";
            case "TypeCountDFLR":
                return "org.nlp4l.lucene.ConcatTypeCountDFLRCompoundNounScorer";
            case "TypeCountLR":
                return "org.nlp4l.lucene.ConcatTypeCountLRCompoundNounScorer";
            default:
                throw new IllegalArgumentException(String.format("invalid scorer is specified (%s)", scorer));
            }
        }

        /**
         * @return the index path
         * @throws IllegalArgumentException if {@link #index} is {@code null}
         */
        String getIndex() {
            if (index == null) {
                throw new IllegalArgumentException("index must be specified");
            }
            return index;
        }

        /**
         * @return the name of the field to enumerate
         * @throws IllegalArgumentException if {@link #fieldCn} is {@code null}
         */
        String getFieldNameCn() {
            if (fieldCn == null) {
                throw new IllegalArgumentException("fieldCn must be specified");
            }
            return fieldCn;
        }

        /**
         * @return {@link #fieldLn2} if set, otherwise {@code fieldCn + "_ln2"}
         * @throws IllegalArgumentException if both {@link #fieldLn2} and {@link #fieldCn} are {@code null}
         */
        String getFieldNameLn2() {
            if (fieldLn2 != null) {
                return fieldLn2;
            }
            if (fieldCn != null) {
                return fieldCn + "_ln2";
            }
            throw new IllegalArgumentException("fieldLn2 must be specified");
        }

        /**
         * @return {@link #fieldRn2} if set, otherwise {@code fieldCn + "_rn2"}
         * @throws IllegalArgumentException if both {@link #fieldRn2} and {@link #fieldCn} are {@code null}
         */
        String getFieldNameRn2() {
            if (fieldRn2 != null) {
                return fieldRn2;
            }
            if (fieldCn != null) {
                return fieldCn + "_rn2";
            }
            throw new IllegalArgumentException("fieldRn2 must be specified");
        }

        /** @return {@link #delimiter}, or {@link #DEF_DELIMITER} when it is {@code null} */
        String getDelimiter() {
            return delimiter == null ? DEF_DELIMITER : delimiter;
        }

        /** @return the output file name, possibly {@code null} */
        String getOutFile() {
            return outFile;
        }

        /** @return whether scores are printed */
        boolean getOutScore() {
            return outScore;
        }

        /** @return the maximum number of terms to print */
        int getOutNum() {
            return outNum;
        }
    }

    Config config;

    private CompoundNounScorer scorer;
    private Directory directory;
    private IndexReader reader;
    private String fieldNameCn, fieldNameLn2, fieldNameRn2, delimiter, outFile;
    private PrintWriter pw;
    private boolean outScore;
    private int outNum;

    /** One or more whitespace, ideographic-space (U+3000) or no-break-space (U+00A0) characters. */
    private static final Pattern P_SPACES = Pattern.compile("[\\s\\u3000\\u00a0]+");

    /**
     * Creates an extractor and applies {@code config} to it via {@link #setConfig()}.
     *
     * @param config the settings of the run
     * @return the configured extractor
     * @throws IllegalArgumentException if {@link Config#fieldCn} is not set
     */
    public static TermsExtractor getExtractor(Config config) {
        TermsExtractor te = new TermsExtractor(config);
        te.setConfig();
        return te;
    }

    /**
     * Stores {@code config} without reading it; {@link #setConfig()} must be called before
     * {@link #execute()}.
     *
     * @param config the settings of the run
     */
    protected TermsExtractor(Config config) {
        this.config = config;
    }

    /**
     * Copies field names, delimiter and output settings from the configuration into this
     * instance. The index path and the scorer are read later, when the run starts.
     *
     * @throws IllegalArgumentException if {@link Config#fieldCn} is not set
     */
    public void setConfig() {
        fieldNameCn = config.getFieldNameCn();
        fieldNameLn2 = config.getFieldNameLn2();
        fieldNameRn2 = config.getFieldNameRn2();
        delimiter = config.getDelimiter();
        outFile = config.getOutFile();
        outScore = config.getOutScore();
        outNum = config.getOutNum();
    }

    /**
     * Opens the index, instantiates the scorer and opens the output writer, in that order.
     *
     * @throws IOException if the index or the output file cannot be opened
     * @throws IllegalArgumentException if the index path or the scorer name is missing or invalid
     * @throws RuntimeException if the scorer class cannot be loaded or instantiated
     */
    void init() throws IOException {
        // Validate the scorer name before touching any resource so that a configuration
        // error surfaces as the original IllegalArgumentException and has no side effects.
        final String scorerClassName = config.getScorer();

        directory = FSDirectory.open(FileSystems.getDefault().getPath(config.getIndex()));
        reader = DirectoryReader.open(directory);

        try {
            // load CompoundNounScorer class
            Class<?> aClass = Class.forName(scorerClassName);
            Constructor<?> aConstr = aClass.getConstructor(IndexReader.class, String.class, String.class,
                    String.class, String.class);
            scorer = (CompoundNounScorer) aConstr.newInstance(reader, delimiter, fieldNameCn, fieldNameLn2,
                    fieldNameRn2);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }

        // Opened last: creating the writer truncates an existing output file, which must not
        // happen when the index or the scorer turned out to be unusable.
        pw = outFile == null ? new PrintWriter(System.out) : new PrintWriter(outFile, "UTF-8");
    }

    /**
     * Runs the extraction: scores every term of the configured field that contains no
     * whitespace, keeps the entries retained by a queue of capacity {@code outNum} and prints
     * them. All resources opened for the run are released before the method returns.
     *
     * @throws IOException if the index cannot be opened or read, the scorer fails, or writing
     *         the result fails
     * @throws IllegalArgumentException if the index path or the scorer name is missing or invalid
     * @throws RuntimeException if the scorer class cannot be loaded or instantiated
     */
    public void execute() throws IOException {
        try {
            init();
            LuceneDocTermVector.TermWeightQueue queue = new LuceneDocTermVector.TermWeightQueue(outNum);

            // getTerms() yields null when the field does not exist or holds no terms;
            // in that case there is simply nothing to score.
            Terms terms = MultiFields.getTerms(reader, fieldNameCn);
            if (terms != null) {
                TermsEnum te = terms.iterator();
                BytesRef text;
                while ((text = te.next()) != null) {
                    String term = text.utf8ToString();
                    // http://rondhuit-dev.com/trac/projects/ticket/184
                    if (P_SPACES.matcher(term).find()) {
                        continue;
                    }
                    LuceneDocTermVector.TermWeight termWeight = new TermScore((float) scorer.score(term));
                    // Immutable entry: setValue() throws UnsupportedOperationException.
                    queue.insertWithOverflow(
                            new java.util.AbstractMap.SimpleImmutableEntry<String, LuceneDocTermVector.TermWeight>(
                                    term, termWeight));
                }
            }
            printQueue(queue);

            // PrintWriter never throws on write failures; ask it explicitly so that a
            // truncated result is not reported as success.
            if (pw.checkError()) {
                throw new IOException(
                        "failed to write the result to " + (outFile == null ? "standard output" : outFile));
            }
        } finally {
            closeResources();
        }
    }

    /** Releases the reader, the directory and the writer of the current run, ignoring close failures. */
    private void closeResources() {
        closeBestEffort(reader);
        reader = null;
        closeBestEffort(directory);
        directory = null;
        if (pw != null) {
            if (outFile == null) {
                // The writer wraps System.out, which belongs to the JVM and must stay open.
                pw.flush();
            } else {
                IOUtils.closeQuietly(pw);
            }
            pw = null;
        }
    }

    /** Closes {@code resource} if it is not {@code null}; a failure to close is deliberately ignored. */
    private static void closeBestEffort(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Cleanup only: nothing useful can be done here, and throwing would hide the
            // exception (if any) that ended the run.
        }
    }

    /**
     * Drains {@code queue} and prints its entries in the reverse of the order in which the
     * queue releases them (highest score first, assuming the queue pops its lowest weight first).
     *
     * @param queue the queue holding the retained entries; it is empty afterwards
     */
    void printQueue(LuceneDocTermVector.TermWeightQueue queue) {
        List<Map.Entry<String, LuceneDocTermVector.TermWeight>> list = new ArrayList<Entry<String, LuceneDocTermVector.TermWeight>>(
                queue.size());
        Map.Entry<String, LuceneDocTermVector.TermWeight> entry;
        while ((entry = queue.pop()) != null) {
            list.add(entry);
        }

        for (int i = list.size() - 1; i >= 0; i--) {
            printResultEntry(list.get(i));
        }
    }

    /**
     * Prints one result line: {@code term, score} when scores are enabled, otherwise only the term.
     *
     * @param e the entry to print
     */
    protected void printResultEntry(Map.Entry<String, LuceneDocTermVector.TermWeight> e) {
        if (outScore) {
            pw.printf("%s, %f\n", getTerm(e), getScore(e));
        } else {
            pw.printf("%s\n", getTerm(e));
        }
    }

    /**
     * @param e a result entry
     * @return the entry's term with every occurrence of the delimiter removed
     */
    protected String getTerm(Map.Entry<String, LuceneDocTermVector.TermWeight> e) {
        return e.getKey().replace(delimiter, "");
    }

    /**
     * @param e a result entry
     * @return the weight stored in the entry
     */
    protected float getScore(Map.Entry<String, LuceneDocTermVector.TermWeight> e) {
        return e.getValue().weight();
    }

    /**
     * A {@link org.nlp4l.lucene.LuceneDocTermVector.TermWeight} that simply carries the score
     * computed for a term.
     *
     * @since 0.3
     */
    public static class TermScore implements LuceneDocTermVector.TermWeight {

        private final float score;

        /** @param score the score to report as weight */
        public TermScore(float score) {
            this.score = score;
        }

        /** @return the score given at construction time */
        public float weight() {
            return score;
        }
    }

    /**
     * Base class of the scorers that {@link TermsExtractor} instantiates reflectively.
     * Subclasses must offer a public constructor with the same parameter list as this one.
     *
     * @since 0.3
     */
    public static abstract class CompoundNounScorer {

        protected final IndexReader reader;
        protected final String delimiter, fieldNameCn, fieldNameLn2, fieldNameRn2;

        /**
         * @param reader the reader of the index being analysed
         * @param delimiter the configured delimiter
         * @param fieldNameCn the name of the field whose terms are scored
         * @param fieldNameLn2 the configured {@code ln2} field name
         * @param fieldNameRn2 the configured {@code rn2} field name
         */
        public CompoundNounScorer(IndexReader reader, String delimiter, String fieldNameCn, String fieldNameLn2,
                String fieldNameRn2) {
            this.reader = reader;
            this.delimiter = delimiter;
            this.fieldNameCn = fieldNameCn;
            this.fieldNameLn2 = fieldNameLn2;
            this.fieldNameRn2 = fieldNameRn2;
        }

        /**
         * Computes the score of one term.
         *
         * @param compNoun the term text as stored in the {@code fieldNameCn} field
         * @return the score of the term
         * @throws IOException if reading the index fails
         */
        public abstract double score(String compNoun) throws IOException;
    }
}