Java tutorial
/* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ package net.skyatlas.icd.test; import java.io.BufferedReader; import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.util.Collection; import java.util.Date; import java.util.HashSet; import java.util.List; import java.util.ResourceBundle; import love.cq.util.IOUtil; import org.ansj.app.keyword.KeyWordComputer; import org.ansj.app.keyword.Keyword; import org.ansj.domain.Term; import org.ansj.library.UserDefineLibrary; import org.ansj.lucene4.AnsjAnalysis; import org.ansj.recognition.NatureRecognition; import org.ansj.splitWord.analysis.NlpAnalysis; import org.ansj.splitWord.analysis.ToAnalysis; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.InvalidTokenOffsetsException; import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.Version; /** * * @author changzhenghe */ public class AnsegTest { static public void main(String[] args) throws IOException, CorruptIndexException, ParseException, InvalidTokenOffsetsException { AnsegTest inst = new AnsegTest(); Token nt = new Token(); Analyzer ca = new AnsjAnalysis(); Reader sentence = new StringReader( "\n\n\n\n\n\n\n????, ????????????????????????????" + "???????????????????" + "??????????? ??????????????2????" + "" + "? ????????????? ??? ????????"); TokenStream ts = ca.tokenStream("sentence", sentence); System.out.println("start: " + (new Date())); long before = System.currentTimeMillis(); while (ts.incrementToken()) { System.out.println(ts.getAttribute(CharTermAttribute.class)); } ts.close(); long now = System.currentTimeMillis(); System.out.println("time: " + (now - before) / 1000.0 + " s"); HashSet<String> hs = new HashSet<String>(); BufferedReader reader2 = IOUtil.getReader(ResourceBundle.getBundle("library").getString("stopLibrary"), "UTF-8"); String word = null; while ((word = reader2.readLine()) != null) { hs.add(word); } Analyzer analyzer = new AnsjAnalysis(hs, false); Directory directory = null; IndexWriter iwriter = null; BufferedReader reader = IOUtil.getReader("/Users/changzhenghe/Downloads/hy_statspack01.txt", "UTF-8"); String temp = null; StringBuilder sb = new StringBuilder(); while ((temp = reader.readLine()) != null) { sb.append(temp); sb.append("\n"); } reader.close(); String text = sb.toString(); text = "???????????? ??? ????????"; IndexWriterConfig ic = new IndexWriterConfig(Version.LUCENE_32, analyzer); // directory = new RAMDirectory(); iwriter = new IndexWriter(directory, ic); // BufferedReader reader = // IOUtil.getReader("/Users/ansj/Documents//?//1998?_.txt", // "GBK"); // String temp = null; // while ((temp = reader.readLine()) != null) { // addContent(iwriter, temp); // } inst.addContent(iwriter, "? ?() (?)"); inst.addContent(iwriter, " ?() (?)"); inst.addContent(iwriter, "? ? (?)"); inst.addContent(iwriter, " ??NEC "); inst.addContent(iwriter, "?"); iwriter.commit(); iwriter.close(); System.out.println(""); inst.search(analyzer, directory, ""); inst.search(analyzer, directory, ""); inst.search(analyzer, directory, ""); inst.search(analyzer, directory, "?"); /* KeyWordComputer kwc = new KeyWordComputer(5); String title = "??"; String content = "9??" + "?????????" + "????" + "??" + "?????" + "???" + "??????" + "???" + "????20??" + "????" + "?" + "???]??" + "???"; Collection<Keyword> result = kwc.computeArticleTfidf(title, content); System.out.println(result); AnsegTest t = new AnsegTest(); List<Term> parse = ToAnalysis.parse("?"); System.out.println(parse); System.out.println("*********** ? ************"); // UserDefineLibrary.insertWord("", "userDefine", 1000); // UserDefineLibrary.insertWord("?", "userDefine", 1000); UserDefineLibrary.insertWord("?", "userDefine", 1000); parse = ToAnalysis.parse("???"); System.out.println(parse); */ } private Analyzer ansjHeightAnalyzer = new AnsjAnalysis(); private void search(Analyzer analyzer, Directory directory, String queryStr) throws CorruptIndexException, IOException, ParseException, InvalidTokenOffsetsException { IndexSearcher isearcher; IndexReader reader = DirectoryReader.open(directory); // isearcher = new IndexSearcher(reader); QueryParser tq = new QueryParser(Version.LUCENE_32, "text", ansjHeightAnalyzer); Query query = tq.parse(queryStr); System.out.println(query); TopDocs hits = isearcher.search(query, 5); System.out.println(queryStr + ":" + hits.totalHits + "?!"); for (int i = 0; i < hits.scoreDocs.length; i++) { int docId = hits.scoreDocs[i].doc; Document document = isearcher.doc(docId); System.out.println(toHighlighter(ansjHeightAnalyzer, query, document)); } } private String toHighlighter(Analyzer analyzer, Query query, Document doc) throws InvalidTokenOffsetsException { String field = "text"; try { SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter("<font color=\"red\">", "</font>"); Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new QueryScorer(query)); TokenStream tokenStream1 = analyzer.tokenStream("text", new StringReader(doc.get(field))); String highlighterStr = highlighter.getBestFragment(tokenStream1, doc.get(field)); return highlighterStr == null ? doc.get(field) : highlighterStr; } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (InvalidTokenOffsetsException e) { // TODO Auto-generated catch block e.printStackTrace(); } return null; } private void addContent(IndexWriter iwriter, String text) throws CorruptIndexException, IOException { Document doc = new Document(); doc.add(new Field("text", text, Field.Store.YES, Field.Index.ANALYZED)); iwriter.addDocument(doc); } }