Java tutorial: indexing HTML documents as passages with Apache Lucene
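The program below walks a directory tree, splits each HTML file into passages at block-level tags (<P>, <H1>-<H6>, <BR>, <HR>, table and list tags), strips the markup with Jsoup, and indexes every passage of ten or more words as its own Lucene document, ranked with an LMDirichlet similarity. Before the full listing, here is a minimal, standalone sketch of the analysis chain the indexer applies to each passage (lower-casing, classic filtering, ASCII folding, stop-word removal). The class name ChainDemo and the four inline stop words are illustrative stand-ins; the real indexer loads its stop-word list from a file on disk. The sample sentence comes from a test string left in the original source, and a Lucene version matching the listing's imports (roughly 6.x/7.x) is assumed:

import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.standard.ClassicFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ChainDemo {
  public static void main(String[] args) throws Exception {
    // Illustrative stop words; IndexFiles loads its list from a file instead.
    CharArraySet stopSet = new CharArraySet(Arrays.asList("that", "is", "in", "of"), true);

    StandardTokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader("Some stuff that is in need of analysis."));

    // Same chain as IndexFiles: lower-case -> classic filter -> ASCII fold -> stop words.
    TokenStream ts = new StopFilter(
        new ASCIIFoldingFilter(new ClassicFilter(new LowerCaseFilter(tokenizer))), stopSet);

    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // prints: some, stuff, need, analysis
    }
    ts.end();
    ts.close();
  }
}

The full indexer additionally runs each surviving token through EnglishMinimalStemmer (an "s"-stemmer) before writing it to the index, as shown in the listing that follows.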
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package lucene;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

// Several of the imports below (commons-lang, the IB/DFR similarities, the
// Porter stemmer, WhitespaceTokenizer, RAMDirectory) are only needed by the
// commented-out alternative configurations further down.
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.en.EnglishMinimalStemmer;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.standard.ClassicFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.similarities.AfterEffectL;
import org.apache.lucene.search.similarities.BasicModelIn;
import org.apache.lucene.search.similarities.DFRSimilarity;
import org.apache.lucene.search.similarities.DistributionLL;
import org.apache.lucene.search.similarities.IBSimilarity;
import org.apache.lucene.search.similarities.LMDirichletSimilarity;
import org.apache.lucene.search.similarities.LambdaDF;
import org.apache.lucene.search.similarities.NormalizationH1;
import org.apache.lucene.search.similarities.NormalizationH2;
import org.apache.lucene.search.similarities.NormalizationH3;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
 * Index all text files under a directory, one Lucene document per passage.
 */
public class IndexFiles {

  /** Running count of passages added to the index. */
  static int n = 0;

  private IndexFiles() {}

  public static void main(String[] args) {
    String usage = "java org.apache.lucene.demo.IndexFiles"
        + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n"
        + "This indexes the documents in DOCS_PATH, creating a Lucene index "
        + "in INDEX_PATH that can be searched with SearchFiles";
    String indexPath = "E:/index26";
    // Log of index configurations tried so far:
    //  1: DFR                          2: normal (StandardAnalyzer)
    //  3: IB with H3                   4: IB with H2 + Porter stemmer
    //  5: IB with H2 + s-stemmer       6: IB with H2, no stemmer
    //  7: no stemmer, no <p split      8: basic, all tags
    //  9: IB with H2 + stopwords, no stemmer
    // 10: like 9 without IB, lower-case tags
    // 11: like 10 with lower-case tags p, br and hr
    // 12: like 11 with closing tags
    // 13: closing tags, punctuation replaced, whitespace tokenizer, hyphens handled
    // 14: standard tokenizer, hyphens handled, with stemmer
    // 15: like 14 without stemming    16: like 15 with LMD
    // 17: like 11 with LMD
    // 18: with count of lower and upper delimiters of split
    // 19: like 18 with (?i) to ignore case everywhere and validpas > 9
    // 20: with (?i) everywhere        21: fresh 19
    // 22: legal spans with LMD        23: fresh 19 without 0 pass
    // 24: legal spans with InB2       25: same as 23
    // 26: 25 with s-stemmer and 0     27: legal-spans demo of 50 passages
    // 28: modified legal span, fast   29: 28 with s-stemming
    // 30: 28 with Porter stemming
    String docsPath = "E:/documents/text";
    boolean create = true;
    for (int i = 0; i < args.length; i++) {
      if ("-index".equals(args[i])) {
        indexPath = args[i + 1];
        i++;
      } else if ("-docs".equals(args[i])) {
        docsPath = args[i + 1];
        i++;
      } else if ("-update".equals(args[i])) {
        create = false;
      }
    }

    if (docsPath == null) {
      System.err.println("Usage: " + usage);
      System.exit(1);
    }

    final Path docDir = Paths.get(docsPath);
    if (!Files.isReadable(docDir)) {
      System.out.println("Document directory '" + docDir.toAbsolutePath()
          + "' does not exist or is not readable, please check the path");
      System.exit(1);
    }

    Date start = new Date();
    try {
      System.out.println("Indexing to directory '" + indexPath + "'...");

      Directory dir = FSDirectory.open(Paths.get(indexPath));
      // Directory dir = new RAMDirectory();   // in-memory alternative
      StandardAnalyzer analyzer = new StandardAnalyzer();
      IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

      // Alternative ranking models tried in earlier configurations:
      // IBSimilarity similarity = new IBSimilarity(
      //     new DistributionLL(),        // or new DistributionSPL()
      //     new LambdaDF(),              // or new LambdaTTF()
      //     new NormalizationH2());
      // DFRSimilarity similarity = new DFRSimilarity(   // InB2 similarity
      //     new BasicModelIn(), new AfterEffectL(), new NormalizationH1());
      LMDirichletSimilarity similarity = new LMDirichletSimilarity(); // LMD model
      iwc.setSimilarity(similarity);

      if (create) {
        // Create a new index in the directory, removing any
        // previously indexed documents:
        iwc.setOpenMode(OpenMode.CREATE);
      } else {
        // Add new documents to an existing index:
        iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
      }

      // Optional: for better indexing performance, if you
      // are indexing many documents, increase the RAM
      // buffer. But if you do so, increase the max heap
      // size of the JVM too (e.g. add -Xmx512m or -Xmx1g):
      //
      // iwc.setRAMBufferSizeMB(256.0);

      // NOTE: the open mode must be set on the config *before* the writer is
      // created; the original code created the writer first, so the
      // CREATE/CREATE_OR_APPEND choice silently had no effect.
      IndexWriter writer = new IndexWriter(dir, iwc);

      indexDocs(writer, docDir);

      // NOTE: if you want to maximize search performance,
      // you can optionally call forceMerge here. This can be
      // a terribly costly operation, so generally it's only
      // worth it when your index is relatively static (i.e.
      // you're done adding documents to it):
      //
      // writer.forceMerge(1);

      writer.close();

      Date end = new Date();
      System.out.println(end.getTime() - start.getTime() + " total milliseconds");
    } catch (IOException e) {
      System.out.println(" caught a " + e.getClass()
          + "\n with message: " + e.getMessage());
    }
  }

  /**
   * Indexes the given file using the given writer, or if a directory is given,
   * recurses over files and directories found under the given directory.
   *
   * NOTE: This method indexes one document per input file. This is slow. For good
   * throughput, put multiple documents into your input file(s). An example of this is
   * in the benchmark module, which can create "line doc" files, one document per line,
   * using the
   * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
   * >WriteLineDocTask</a>.
   *
   * @param writer Writer to the index where the given file/dir info will be stored
   * @param path The file to index, or the directory to recurse into to find files to index
   * @throws IOException If there is a low-level I/O error
   */
  static void indexDocs(final IndexWriter writer, Path path) throws IOException {
    if (Files.isDirectory(path)) {
      Files.walkFileTree(path, new SimpleFileVisitor<Path>() {
        @Override
        public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
          try {
            System.out.println(file.toString());
            indexDoc(writer, file, attrs.lastModifiedTime().toMillis());
          } catch (IOException ignore) {
            // don't index files that can't be read
          }
          return FileVisitResult.CONTINUE;
        }
      });
    } else {
      indexDoc(writer, path, Files.getLastModifiedTime(path).toMillis());
    }
  }

  /** Indexes a single file, one Lucene document per passage. */
  static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
      // Read the whole file into a string.
      BufferedReader reader = new BufferedReader(
          new InputStreamReader(stream, StandardCharsets.UTF_8));
      String line = null;
      StringBuilder stringBuilder = new StringBuilder();
      String ls = System.getProperty("line.separator");
      try {
        while ((line = reader.readLine()) != null) {
          stringBuilder.append(line);
          stringBuilder.append(ls);
        }
      } finally {
        reader.close();
      }

      // Index the file name.
      Field fileNameField = new StringField("name", file.getFileName().toString(), Field.Store.YES);

      // Add the path of the file as a field named "path". Use a
      // field that is indexed (i.e. searchable), but don't tokenize
      // the field into separate words and don't index term frequency
      // or positional information:
      Field pathField = new StringField("path", file.toString(), Field.Store.YES);

      // Add the last modified date of the file to a field named "modified".
      // Use a LongPoint that is indexed (i.e. efficiently filterable with
      // PointRangeQuery). This indexes to millisecond resolution, which
      // is often too fine.
      // You could instead create a number based on
      // year/month/day/hour/minutes/seconds, down to the resolution you require.
      // For example the long value 2011021714 would mean
      // February 17, 2011, 2-3 PM.

      // The reader above expects the file to be in UTF-8 encoding;
      // if that's not the case, searching for special characters will fail.
      String file_content = stringBuilder.toString();

      // Split the file into passages at (case-insensitive) block-level HTML tags.
      // Earlier experiments split on "<P|<p" only, on the closed forms of the
      // tags, or used StringUtils.substringsBetween(file_content, "<P", "<P").
      String[] passages = file_content.split(
          "(?i)<P|<H1|<H2|<H3|<H4|<H5|<H6|<BR|<HR|<TABLE|<TD|<TH|<TR|<OL|<UL");

      String title;
      Document dochtml;
      String ptitle = "";

      // ------- analysis chain setup -------
      StandardTokenizer stdToken = new StandardTokenizer();
      // Tokenizer stdToken = new WhitespaceTokenizer();   // earlier alternative
      EnglishMinimalStemmer stemmer = new EnglishMinimalStemmer();

      // Load the stop-word list, one word per line.
      // (Re-read for every file; could be hoisted out of this method.)
      final List<String> stopWords = new ArrayList<>();
      String f = "E:/stopwords_en.txt";
      try (BufferedReader br = new BufferedReader(new FileReader(f))) {
        String topic;
        while ((topic = br.readLine()) != null) {
          stopWords.add(topic.trim());
        }
      }
      final CharArraySet stopSet = new CharArraySet(stopWords, false);

      if (passages != null) {
        int j = 0;
        // Take the first passage after the leading chunk as the document title.
        if (passages.length > 1) {
          title = passages[1].split("</P|</H1|</H2|</H3|</H4|</H5|</H6|</p")[0];
          dochtml = Jsoup.parse(title);
          ptitle = dochtml.body().text().toLowerCase();
          System.out.println("Title is " + ptitle);
        }
        for (int i = 0; i < passages.length; i++) {
          // Cut the passage off at its closing tag, then strip markup with Jsoup.
          String[] passage_contents = passages[i].split("</P|</p");
          String passage_content = passage_contents[0];
          dochtml = Jsoup.parse(passage_content);
          String plainStr = dochtml.body().text();

          // Only index passages of at least 10 whitespace-separated tokens.
          String[] validpas = plainStr.split(" ");
          if (validpas.length > 9) {
            j++;
            Field passageId = new StringField("id",
                file.getFileName().toString() + "." + i, Field.Store.YES);

            org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();
            doc.add(fileNameField);
            doc.add(pathField);
            doc.add(passageId);
            doc.add(new StringField("offset",
                file_content.indexOf(passage_content) + "", Field.Store.YES));
            doc.add(new StringField("length",
                passage_content.length() + "", Field.Store.YES));
            doc.add(new LongPoint("modified", lastModified));
            doc.add(new TextField("title", ptitle, Store.YES));

            // -------- text processing --------
            // Lower-case, classic-filter, ASCII-fold, then drop stop words.
            stdToken.setReader(new StringReader(plainStr));
            TokenStream tokenStream = new StopFilter(
                new ASCIIFoldingFilter(new ClassicFilter(new LowerCaseFilter(stdToken))),
                stopSet);
            // tokenStream = new PorterStemFilter(tokenStream);   // earlier alternative
            tokenStream.reset();

            String term = "";
            StringBuilder sb = new StringBuilder();
            CharTermAttribute charTermAttr = tokenStream.getAttribute(CharTermAttribute.class);
            try {
              while (tokenStream.incrementToken()) {
                if (sb.length() > 0) {
                  sb.append(" ");
                }
                term = charTermAttr.toString();
                // Strip dots from acronyms such as D.N.A (but leave numbers alone).
                if (term.contains(".") && !term.matches(".*\\d+.*")) {
                  term = term.replaceAll("\\.", "");
                }
                // Minimal English ("s") stemming: stem() returns the stemmed length.
                int l = stemmer.stem(term.toCharArray(), term.length());
                sb.append(term, 0, l);
                // (Earlier variants also rejoined hyphenated terms, indexing both
                // the parts and their concatenation; see configurations 13-14.)
              }
            } catch (IOException e) {
              System.out.println(e.getMessage());
            }
            tokenStream.end();
            tokenStream.close();
            // -------- end of text processing --------

            // Unlike the stock Lucene demo, which hands a Reader to the field,
            // this version pre-processes the passage text itself and stores the
            // result in the "contents" field.
            doc.add(new TextField("contents", sb.toString(), Store.YES));

            if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
              n++;
              // New index, so we just add the document (no old document can be there):
              System.out.println(".......adding " + file.getFileName().toString()
                  + " passage " + j + "--" + n);
              writer.addDocument(doc);
            } else {
              // Existing index (an old copy of this document may have been indexed) so
              // we use updateDocument instead to replace the old one matching the exact
              // path, if present:
              System.out.println("updating " + file);
              // NOTE: every passage of a file shares the same "path" value, so
              // updating by path replaces all of them with this single document;
              // updating by the unique "id" field would preserve the rest.
              writer.updateDocument(new Term("path", file.toString()), doc);
            }
          }
        }
      }
    }
  }
}
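The usage string says the resulting index "can be searched with SearchFiles". Below is a minimal search-side sketch under the same Lucene version assumption; the class name SearchDemo and the query "heart attack" are illustrative, not part of the tutorial. The essential point is that the searcher must be given the same LMDirichletSimilarity that was set at index time, or scoring will not match the model the index was built for. For brevity this sketch parses the query with a plain StandardAnalyzer; strictly, query text should go through the same folding, stop-word, and stemming chain the indexer applied to the "contents" field.

import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.similarities.LMDirichletSimilarity;
import org.apache.lucene.store.FSDirectory;

public class SearchDemo {
  public static void main(String[] args) throws Exception {
    try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("E:/index26")))) {
      IndexSearcher searcher = new IndexSearcher(reader);
      // Must match the similarity used at index time, or scores are inconsistent.
      searcher.setSimilarity(new LMDirichletSimilarity());

      QueryParser parser = new QueryParser("contents", new StandardAnalyzer());
      Query query = parser.parse("heart attack");

      // Print the passage id, title, and score of the top ten hits.
      for (ScoreDoc hit : searcher.search(query, 10).scoreDocs) {
        org.apache.lucene.document.Document d = searcher.doc(hit.doc);
        System.out.println(d.get("id") + "\t" + d.get("title") + "\t" + hit.score);
      }
    }
  }
}

Because each passage was indexed as its own document with "id", "offset", and "length" fields, the hits above are passages rather than whole files, and the stored offset and length can be used to locate each hit inside its source file.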