package lucene;

import java.nio.charset.StandardCharsets;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.en.EnglishMinimalStemmer;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.standard.ClassicFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * Command-line tool that indexes the files under a documents directory into a Lucene index.
 *
 * <p>{@code main} parses the arguments and opens the writer, {@code indexDocs} walks the
 * directory tree, and {@code indexDoc} turns one file into passage-level Lucene documents.
 *
 * <p>NOTE(review): this file appears to have lost lines (closing braces and some statements);
 * the notes in the members below mark the places where that is visible.
 */
public class IndexFiles {
    // Printed at the end of the "adding" log message in indexDoc; no visible code changes it.
    static int n = 0;

    // Private constructor.
    // NOTE(review): the closing brace of this constructor is not present in the file.
    private IndexFiles() {

    // NOTE(review): not referenced anywhere in the visible code.
    int counter = 0;

    /**
     * Index all text files under a directory.
     *
     * <p>Recognised arguments are {@code -index INDEX_PATH}, {@code -docs DOCS_PATH} and
     * {@code -update}. When {@code -docs} is not given the hard-coded default
     * {@code E:/documents/text} is used; {@code -update} clears the {@code create} flag.
     * The elapsed time in milliseconds is printed after indexing, and an {@code IOException}
     * is reported on standard output.
     *
     * <p>NOTE(review): several lines of this method are missing from the file (closing braces,
     * the declaration of {@code indexPath}, the initializer of {@code dir}); see the notes
     * inline.
     *
     * @param args command-line arguments as described above
     */
    public static void main(String[] args) {
        String usage = "java org.apache.lucene.demo.IndexFiles"
                + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n"
                + "This indexes the documents in DOCS_PATH, creating a Lucene index"
                + "in INDEX_PATH that can be searched with SearchFiles";
        // NOTE(review): indexPath is assigned and printed below but never declared in the visible code.
        String docsPath = "E:/documents/text";
        boolean create = true;
        // Each option that takes a value reads it from the following argument (args[i + 1]).
        // NOTE(review): no bounds check and no extra i++ for the consumed value are visible,
        // and the closing braces of the if-chain and the loop are missing.
        for (int i = 0; i < args.length; i++) {
            if ("-index".equals(args[i])) {
                indexPath = args[i + 1];
            } else if ("-docs".equals(args[i])) {
                docsPath = args[i + 1];
            } else if ("-update".equals(args[i])) {
                create = false;

        // docsPath starts with a non-null default, so this check is not expected to fire.
        // NOTE(review): no exit/return after printing the usage is visible here.
        if (docsPath == null) {
            System.err.println("Usage: " + usage);

        // Refuse to continue when the documents directory cannot be read.
        // NOTE(review): the message contains a stray "dhfndk", and no exit/return follows it
        // in the visible code.
        final Path docDir = Paths.get(docsPath);
        if (!Files.isReadable(docDir)) {
            System.out.println("Document directory dhfndk '" + docDir.toAbsolutePath()
                    + "' does not exist or is not readable, please check the path");

        Date start = new Date();
        try {
            System.out.println("Indexing to directory '" + indexPath + "'...");

            // NOTE(review): the initializer expression is missing, and Directory is not among
            // the visible imports.
            Directory dir =;
            //Analyzer analyzer = new StandardAnalyzer();
            //IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

            StandardAnalyzer analyzer = new StandardAnalyzer();
            //Directory dir = new RAMDirectory();
            IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
            /*IBSimilarity similarity = new IBSimilarity(
                  new DistributionLL(),//1
                  //new DistributionSPL(),//2
                  new LambdaDF(),//1
                  //new LambdaTTF(), //2
                  new NormalizationH2());*/
            /*DFRSimilarity similarity = new DFRSimilarity( ///////INB2 Similarity
              new BasicModelIn(),
              new AfterEffectL(),
              new NormalizationH1());*/
            // NOTE(review): this similarity is constructed but never handed to iwc in the
            // visible code, and LMDirichletSimilarity is not among the visible imports.
            LMDirichletSimilarity similarity = new LMDirichletSimilarity();//////// LMD Model
            IndexWriter writer = new IndexWriter(dir, iwc);

            // NOTE(review): both branches contain only comments here; no open mode is set on
            // iwc in the visible code, and the else branch is not closed.
            if (create) {
                // Create a new index in the directory, removing any
                // previously indexed documents:
            } else {
                // Add new documents to an existing index:
            System.out.println("Test 1");

            // Optional: for better indexing performance, if you
            // are indexing many documents, increase the RAM
            // buffer.  But if you do this, increase the max heap
            // size to the JVM (eg add -Xmx512m or -Xmx1g):
            // iwc.setRAMBufferSizeMB(256.0);

            //IndexWriter writer = new IndexWriter(dir, iwc);
            System.out.println("Test 2");
            indexDocs(writer, docDir);
            System.out.println("Test 3");

            // NOTE: if you want to maximize search performance,
            // you can optionally call forceMerge here.  This can be
            // a terribly costly operation, so generally it's only
            // worth it when your index is relatively static (ie
            // you're done adding documents to it):
            // writer.forceMerge(1);

            // NOTE(review): the writer is never closed in the visible code.

            // Report the wall-clock duration of the run.
            Date end = new Date();
            System.out.println(end.getTime() - start.getTime() + " total milliseconds");

        // NOTE(review): IOException is not among the visible imports, and the closing braces
        // of this catch block and of the method are missing.
        } catch (IOException e) {
            System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());

    static void indexDocs(final IndexWriter writer, Path path) throws IOException {
        System.out.println("Test 2.1");

        if (Files.isDirectory(path)) {
            System.out.println("Test 2.2");
            Files.walkFileTree(path, new SimpleFileVisitor<Path>() {
                public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                    try {
                        System.out.println("Test 2.3");
                        indexDoc(writer, file, attrs.lastModifiedTime().toMillis());
                        System.out.println("Test 2.4");
                    } catch (IOException ignore) {
                        // don't index files that can't be read.
                    return FileVisitResult.CONTINUE;
        } else {
            indexDoc(writer, path, Files.getLastModifiedTime(path).toMillis());

    /**
     * Indexes a single file as a series of passage documents.
     *
     * <p>The file is opened as a UTF-8 stream and its content is split into passages. Every
     * passage whose plain text (markup removed with Jsoup) splits into more than nine
     * space-separated pieces becomes one Lucene document with {@code offset}, {@code length},
     * {@code modified}, {@code title} and {@code contents} fields. The {@code contents} value is
     * rebuilt from a token stream (standard tokenizer, lower-casing, classic filter, ASCII
     * folding, stop-word removal) whose terms are stemmed and joined into one string.
     *
     * <p>NOTE(review): this method has lost lines in the file (closing braces, loop bodies,
     * the argument of {@code split(}, the tail of some field constructors); the notes inline
     * mark each visible gap. InputStream, BufferedReader, InputStreamReader, FileReader,
     * StringReader and IOException are used but are not among the visible imports.
     *
     * @param writer       index writer; its open mode selects the add or the update branch
     * @param file         file to read and index
     * @param lastModified last-modified time of the file in milliseconds, stored in the
     *                     {@code modified} point field
     * @throws IOException declared for failures while opening or reading the file
     */
    static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {

        try (InputStream stream = Files.newInputStream(file)) {

            // Read the file as UTF-8 text; the Lucene documents themselves are created
            // per passage further down.
            System.out.println("Test 3.1");
            BufferedReader reader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
            String line = null;
            StringBuilder stringBuilder = new StringBuilder();
            String ls = System.getProperty("line.separator");

            // NOTE(review): the body of this read loop and of the finally block is missing;
            // presumably each line was appended to stringBuilder followed by ls — confirm.
            try {
                while ((line = reader.readLine()) != null) {

            } finally {

            //index file name
            // NOTE(review): fileNameField is never added to a document in the visible code.
            Field fileNameField = new StringField("name", file.getFileName().toString(), Field.Store.YES);

            // Add the path of the file as a field named "path".  Use a
            // field that is indexed (i.e. searchable), but don't tokenize
            // the field into separate words and don't index term frequency
            // or positional information:
            // NOTE(review): pathField is never added to a document in the visible code.
            Field pathField = new StringField("path", file.toString(), Field.Store.YES);

            // Add the last modified date of the file a field named "modified".
            // Use a LongPoint that is indexed (i.e. efficiently filterable with
            // PointRangeQuery).  This indexes to milli-second resolution, which
            // is often too fine.  You could instead create a number based on
            // year/month/day/hour/minutes/seconds, down the resolution you require.
            // For example the long value 2011021714 would mean
            // February 17, 2011, 2-3 PM.

            // The contents of the file go into a field named "contents". In this version the
            // value is a pre-processed string built per passage below and it is stored
            // (Store.YES), not streamed from a Reader.
            // The file is read as UTF-8; if that's not its encoding, searching for special
            // characters will fail.

            String file_content = stringBuilder.toString();
            //String[] passages = file_content.split("<P|<p");
            //String[] passages = file_content.split("<P");
            //String[] passages = file_content.split("<P>|<H1>|<H2>|<H3>|<H4>|<H5>|<H6>|<BR>|<HR>|<TABLE>|<TD>|<TH>|<TR>|<OL>|<UL>|<p>|<br>|<hr>");//|<p|<h1|<h2|<h3|<h4|<h5|<h6|<br|<hr|<table|<td|<th|<tr|<ol|<ul");
            // NOTE(review): the argument of this split call (the passage delimiter regex)
            // is missing from the file.
            String[] passages = file_content.split(

            //String[] passages = StringUtils.substringsBetween(file_content, "<P", "<P");
            //String[] title = StringUtils.substringsBetween(file_content, "<body>", "</");
            //String title = passages[0];
            String title;
            Document dochtml;// = Jsoup.parse(title);
            String ptitle = ""; //= dochtml.body().text();
            //System.out.println("Title is" + ptitle);
            //Field titleField = new StringField("title", ptitle, Field.Store.YES);

            ///////------FORMATTING TEXT---------
            // A single tokenizer instance is created here and given a new reader per passage.
            StandardTokenizer stdToken = new StandardTokenizer();
            //Tokenizer stdToken = new WhitespaceTokenizer();
            EnglishMinimalStemmer stemmer = new EnglishMinimalStemmer();

            //stdToken.setReader(new StringReader("Some stuff that is in need of analysis. stuff patients PATIENT d > 0.5 Dnn>Bnn D.N.A diseases heart attacks at cl-fo"));

            //Your code starts here
            // Stop words come from a hard-coded file path.
            // NOTE(review): the loop body (presumably adding each line to stopWords) and the
            // closing braces are missing; FileReader is used without an explicit charset.
            final List<String> stopWords = new ArrayList<>();
            String f = "E:/stopwords_en.txt";

            try (BufferedReader br = new BufferedReader(new FileReader(f))) {

                String topic;
                //int qid = 200;//cntr=0;
                while ((topic = br.readLine()) != null) {
            // Case-sensitive stop set (ignoreCase = false).
            final CharArraySet stopSet = new CharArraySet(stopWords, false);

            //////------FORMATTING TEXT---------
            if (passages != null) {
                int j = 0;
                // The title is the start of the second split piece, up to the first closing
                // paragraph/heading tag, converted to lower-cased plain text.
                if (passages.length > 1) {
                    title = passages[1].split("</P|</H1|</H2|</H3|</H4|</H5|</H6|</p")[0];
                    dochtml = Jsoup.parse(title);
                    ptitle = dochtml.body().text().toLowerCase();
                    System.out.println("Title is" + ptitle);
                for (int i = 0; i < passages.length; i++) {

                    //cnames = cname.split(":");
                    //cname =  cnames[0];
                    // Keep only the text before the first closing paragraph tag.
                    String[] passage_contents = passages[i].split("</P|</p");
                    //String[] passage_contents = passages[i].split("</P");
                    String passage_content = passage_contents[0];
                    //  System.out.println("abc");
                    dochtml = Jsoup.parse(passage_content);
                    String plainStr = dochtml.body().text();
                    String[] validpas = plainStr.split(" ");

                    // Only passages with more than nine space-separated pieces are indexed.
                    if (validpas.length > 9) {
                        // NOTE(review): this constructor call is cut off (store argument and
                        // closing parenthesis missing); passageId is not added to doc in the
                        // visible code.
                        Field passageId = new StringField("id", file.getFileName().toString() + "." + i,

                        org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();
                        // Offset is the first occurrence of the passage text in the file content.
                        // NOTE(review): this call is cut off as well (store argument missing).
                        doc.add(new StringField("offset", file_content.indexOf(passage_content) + "",
                        doc.add(new StringField("length", passage_content.length() + "", Field.Store.YES));
                        doc.add(new LongPoint("modified", lastModified));
                        ((org.apache.lucene.document.Document) doc).add(new TextField("title", ptitle, Store.YES));
                        //InputStream is = new ByteArrayInputStream(passage_content.getBytes());

                        //String strippedText = passage_content.replaceAll("(?s)<[^>]*>(\\s*<[^>]*>)*", " ");

                        //--------TEXT PROCESSING------------
                        TokenStream tokenStream;
                        //String nplainstr = plainStr.replaceAll("-", ".zz");
                        //stdToken.setReader(new StringReader(nplainstr));
                        stdToken.setReader(new StringReader(plainStr));

                        // Filter order: lower-case -> classic filter -> ASCII folding -> stop words.
                        tokenStream = new StopFilter(
                                new ASCIIFoldingFilter(new ClassicFilter(new LowerCaseFilter(stdToken))), stopSet);

                        //tokenStream = new PorterStemFilter(tokenStream);
                        //int l=0;
                        String term = "";
                        StringBuilder sb = new StringBuilder();
                        //OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
                        CharTermAttribute charTermAttr = tokenStream.getAttribute(CharTermAttribute.class);
                        // NOTE(review): no reset()/end()/close() calls on tokenStream are
                        // visible; check against the TokenStream consumer workflow — confirm.
                        try {
                            //int l;
                            while (tokenStream.incrementToken()) {
                                // Separate terms with a single space.
                                if (sb.length() > 0) {
                                    sb.append(" ");
                                term = charTermAttr.toString();
                                   // NOTE(review): in this regex "." matches any character,
                                   // not only a literal dot — confirm that is intended.
                                   term = term.replaceAll(".zz", "-");
                                   String[] terms=term.split("-");
                                   String at="";
                                   for(String t : terms){
                                      //l = stemmer.stem(t.toCharArray(), t.length());
                                      //t = t.substring(0, l);
                                       sb.append(t + " ");
                                       at = at+t;
                                   sb.append(at + " ");

                                // Drop the dots from terms that contain a dot but no digit.
                                if (term.contains(".") && !term.matches(".*\\d+.*")) {//&& StringUtils.isAlpha(term)){
                                    term = term.replaceAll("\\.", "");
                                //int l = stemmer.stem(charTermAttr.toString().toCharArray(), charTermAttr.toString().length());
                                // Only the length returned by the stemmer is used: the first l
                                // characters of the unmodified term are appended.
                                // NOTE(review): stem() receives a throwaway char[] copy, so any
                                // in-place character changes made by the stemmer are lost — confirm.
                                int l;
                                l = stemmer.stem(term.toCharArray(), term.length());
                                sb.append(term, 0, l);

                                   // NOTE(review): terms/at are declared a second time here and
                                   // j is redeclared by the loop below; with the braces missing
                                   // the intended scopes cannot be determined.
                                   String[] terms=term.split("-");
                                   String at="";
                                   for(String t : terms){
                                       sb.append(" " + t);
                                       at = at+t;
                                   sb.append(" " + at);
                                // Also append the individual parts of a hyphenated token.
                                String[] hl = charTermAttr.toString().split("-");
                                if (hl.length > 1){
                                   for(int j=0; j<hl.length; j++){
                                      sb.append(" " + hl[j]);
                                   //sb.append(" " + charTermAttr.toString().split("-")[1]);

                        // NOTE(review): the body of this catch block is missing.
                        } catch (IOException e) {

                        ///----------END OF TEXT PROCESSING----------

                        // The processed term string is stored as the searchable contents.
                        ((org.apache.lucene.document.Document) doc)
                                .add(new TextField("contents", sb.toString(), Store.YES));//new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))));
                        //doc.add(new StringField("contents", passage_content, Field.Store.YES));

                        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                            // New index, so we just add the document (no old document can be there):
                            // NOTE(review): the statement that the next line continues (its
                            // opening call) and any add-document call are missing.
                                    ".......adding " + file.getFileName().toString() + " passage " + j + "--" + n);
                        } else {
                            // Existing index (an old copy of this document may have been indexed) so
                            // we use updateDocument instead to replace the old one matching the exact
                            // path, if present:
                            // NOTE(review): every passage of a file is updated under the same
                            // "path" term; verify this does not replace earlier passages.
                            System.out.println("updating " + file);
                            writer.updateDocument(new Term("path", file.toString()), doc);

