us.colloquy.index.IndexHandler.java Source code

Java tutorial

Introduction

Here is the source code for us.colloquy.index.IndexHandler.java

Source

/*
 * Copyright (c) 2016. Tatyana Gershkovich
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package us.colloquy.index;

import com.fasterxml.jackson.databind.ObjectWriter;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHost;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.junit.Test;
import us.colloquy.model.DiaryEntry;
import us.colloquy.model.DocumentPointer;
import us.colloquy.model.Letter;
import us.colloquy.model.Person;
import us.colloquy.util.*;

import java.io.*;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Stream;

import static us.colloquy.util.ElasticLoader.uploadLettersToElasticServer;

/**
 * Created by Peter Gershkovich on 12/20/15.
 */
public class IndexHandler {
    /**
     * Run this method to load typical Tolstoy letters
     * Should be run after creating tolstoy index see command below
     */

    @Test
    public void loadLetterIndex() {
        Properties properties = loadProperties();

        List<Letter> letterList = new ArrayList<>();

        List<Letter> rejectedLetters = new ArrayList<>(); //a list for rejected letter to review and improve the algorithm

        //get ll letters
        Set<DocumentPointer> documentPointers = new TreeSet<>();

        String homeDir = System.getProperty("user.home");

        //        getURIForAllLetters(documentPointers, homeDir + "/Documents/Tolstoy/unzipLettersSpecial/pisma_k_Sofie_83-84(1)", true);
        //        Person person = new Person();
        //        person.setLastName("??");
        //        person.setFirstName("?");
        //        person.setPaternalName("?");
        //        person.setOriginalEntry("?  ?");

        getURIForAllLetters(documentPointers, System.getProperty("user.home")
                + "/Documents/Tolstoy/unzipLettersSpecial/Pisma_k_Chertkovu_toma_87-89", true);
        getURIForAllLetters(documentPointers, System.getProperty("user.home")
                + "/Documents/Tolstoy/unzipLettersSpecial/Pisma_k_Chertkovu_toma_87-89", true);
        Person person = new Person();
        person.setLastName("");
        person.setFirstName("");
        person.setPaternalName("");
        person.setOriginalEntry(" ..");

        //        getURIForAllLetters(documentPointers, System.getProperty("user.home") + "/Documents/Tolstoy/unzipLetters", false);
        //
        //        Person person = null;

        Map<String, String> toWhomMap = null;

        //load map of addressees
        try {

            toWhomMap = ResourceLoader.loadToWhomMap("references/toWhom.json");

        } catch (Exception e) {
            e.printStackTrace();
        }

        for (DocumentPointer pointer : documentPointers) {
            if (toWhomMap != null) {
                LetterParser.parseLetters(pointer, letterList, person, toWhomMap, rejectedLetters); //test case "temp/OEBPS/Text/0001_1006_2002.xhtml"
            } else {
                System.out.println("ToWhom Map must be initialized.");
            }

        }

        System.out.println("Total number of letters: " + letterList.size());

        Set<String> locations = new TreeSet<>();

        //code below to check a few letters

        int i = 0;

        for (Letter letter : letterList) {
            i++;

            if (letter.getDate() == null) {
                System.out.println(" ------------------------------------------------------------------");
                System.out.println(letter.toString());

            }

            if (StringUtils.isNotEmpty(letter.getPlace())) {

                locations.add(letter.getPlace());

            } else {
                System.out.println(" ------------------------------------------------------------------");
                System.out.println(letter.toString());

            }

            //            if (i > 15)
            //            {
            //                break;
            //            }
        }

        for (String loc : locations) {
            System.out.println(loc);
        }

        // write your code here
        //        Settings settings = Settings.settingsBuilder()
        //                .put("cluster.name", properties.getProperty("elastic_cluster_name")).build();
        //
        //        //open index balk load all letters and process them
        //        try (Client client = TransportClient.builder().settings(settings).build()
        //                .addTransportAddress(new InetSocketTransportAddress(InetAddress.getByName(properties.getProperty("elastic_ip_address")), 9300)))
        //        {

        if (properties.getProperty("upload_to_elastic").equalsIgnoreCase("true")) {
            uploadLettersToElasticServer(properties, letterList);
        }
    }

    @Test
    public void createLettersIndex() throws Exception {
        try (RestHighLevelClient client = new RestHighLevelClient(RestClient
                .builder(new HttpHost("localhost", 9200, "http"), new HttpHost("localhost", 9201, "http")))) {

            IndexingUtils.makeIndex("lntolstoy-letters", "letters", client);
        }
    }

    @Test
    public void createDiariesIndex() throws Exception {
        try (RestHighLevelClient client = new RestHighLevelClient(RestClient
                .builder(new HttpHost("localhost", 9200, "http"), new HttpHost("localhost", 9201, "http")))) {

            IndexingUtils.makeIndex("lntolstoy-diaries", "diaries", client);
        }
    }

    @Test
    public void loadDiariesIndex() {
        Properties properties = loadProperties();

        List<DiaryEntry> diaryEntries = new ArrayList<>();

        List<DocumentPointer> documentPointers = new ArrayList<>();

        //  getURIForAllDiaries(documentPointers, System.getProperty("user.home") + "/Documents/Tolstoy/openDiaries");
        //
        String strPathToDiaries = "samples/diaries";

        //find all volumes with diaries
        Path pathToDiaries = FileSystems.getDefault().getPath(strPathToDiaries);

        List<Path> listOfDiaryVolumes = new ArrayList<>();

        int maxDepth = 1;

        try (Stream<Path> stream = Files.find(pathToDiaries, maxDepth, (path, attr) -> {
            return String.valueOf(path).contains("dnevnik");
        })) {
            stream.forEach(listOfDiaryVolumes::add);

        } catch (IOException e) {
            e.printStackTrace();
        }

        int diaryCounter = 0;

        File entries_debug = new File("entries_debug.txt");

        entries_debug.delete();

        try (FileWriter fw = new FileWriter("entries_debug.txt", true);
                BufferedWriter bw = new BufferedWriter(fw);
                PrintWriter outDebug = new PrintWriter(bw)) {
            outDebug.println("start file");
            //more code

            for (Path path : listOfDiaryVolumes) {
                documentPointers.clear();

                getURIForAllDiaries(documentPointers, path);

                diaryEntries.clear();

                for (DocumentPointer pointer : documentPointers) {
                    List<DiaryEntry> diaryEntriesInPointer = new ArrayList<>();

                    DiaryParser.parseDiaries(pointer, diaryCounter++, diaryEntriesInPointer, outDebug); //test case "temp/OEBPS/Text/0001_1006_2002.xhtml"

                    //find if anything is wrong in a pointer

                    for (DiaryEntry diaryEntry : diaryEntriesInPointer) {
                        if (diaryEntry.getDate() == null || diaryEntry.getEntry().length() < 20) {
                            System.out.println(" Missing entry:  --------------- " + pointer.getUri()
                                    + " ---------------------------------");
                            System.out.println(diaryEntry.toString());

                        }
                    }

                    diaryEntries.addAll(diaryEntriesInPointer);
                }

                System.out.println("Total number of diaries in " + path.getFileName() + ": " + diaryEntries.size());

                //code below to set ids and check a few letters
                int i = 0;

                for (DiaryEntry diaryEntry : diaryEntries) {
                    i++;

                    String id = "D-" + path.getFileName().toString().replaceAll(".*_", "") + "-" + i;

                    diaryEntry.setId(id);
                }

                System.out.println("Total number of diaries in volume: " + diaryEntries.size());

                //export json files

                try {
                    ObjectWriter ow = new com.fasterxml.jackson.databind.ObjectMapper().writer()
                            .withDefaultPrettyPrinter();

                    String json = ow.writeValueAsString(diaryEntries);

                    System.out.println("-------------------- Start exporting diaries in Volume "
                            + path.getFileName() + " ------------- ");

                    //   String origin = diaryEntries.get(0).getSource();

                    String fileName = "parsed/diaries/" + path.getFileName() + ".json";

                    try (Writer out = new BufferedWriter(
                            new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8"))) {
                        out.write(json);

                    }

                    System.out.println("-------------------- End of export in Volume " + path.getFileName()
                            + " ------------- ");

                } catch (Exception e) {
                    e.printStackTrace();
                }

                if (properties.getProperty("upload_to_elastic").equalsIgnoreCase("true")) {

                    ElasticLoader.uploadDiariesToElasticServer(properties, diaryEntries);
                }
            }

            outDebug.println("end file");
            //more code
        } catch (IOException e) {
            //exception handling left as an exercise for the reader
        }

    }

    public void getURIForAllLetters(Set<DocumentPointer> uriList, String letterDirectory, boolean useOnlyNumber) {
        ///Documents/Tolstoy/diaries

        Path pathToLetters = FileSystems.getDefault().getPath(letterDirectory);

        List<Path> results = new ArrayList<>();

        int maxDepth = 6;

        try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> {
            return String.valueOf(path).endsWith(".ncx");
        })) {

            stream.forEach(results::add);

            //            String joined = stream
            //                    .sorted()
            //                    .map(String::valueOf)
            //                    .collect(Collectors.joining("; "));
            //
            //            System.out.println("\nFound: " + joined);

        } catch (IOException e) {
            e.printStackTrace();
        }

        System.out.println("files: " + results.size());

        try {

            for (Path res : results) {
                Path parent = res.getParent();

                //                System.out.println("---------------------------------------------");
                //                System.out.println(parent.toString());
                //use jsoup to list all files that contain something useful
                Document doc = Jsoup.parse(res.toFile(), "UTF-8");

                String title = "";

                for (Element element : doc.getElementsByTag("docTitle")) {
                    //Letter letter = new Letter();

                    // StringBuilder content = new StringBuilder();

                    for (Element child : element.children()) {
                        title = child.text();
                        // System.out.println("Title: " + title);
                    }
                }

                for (Element element : doc.getElementsByTag("navPoint")) {
                    //Letter letter = new Letter();

                    // StringBuilder content = new StringBuilder();

                    for (Element child : element.children()) {
                        String label = child.text();

                        if (StringUtils.isNotEmpty(label)) {
                            if (label.matches("?")) {
                                System.out.println("------------------");
                            }

                            String url = child.getElementsByTag("content").attr("src");

                            if (label.matches(".*\\d{1,3}.*[?--?]+.*") && StringUtils.isNotEmpty(url)) {
                                DocumentPointer documentPointer = new DocumentPointer(
                                        parent.toString() + File.separator + url.replaceAll("#.*", ""), title);

                                uriList.add(documentPointer);
                                //                                System.out.println("nav point: " + label + " src " + parent.toString()
                                //                                        + System.lineSeparator() + url.replaceAll("#.*",""));

                            } else if (label.matches(".*\\d{1,3}.*") && StringUtils.isNotEmpty(url)
                                    && useOnlyNumber) {
                                DocumentPointer documentPointer = new DocumentPointer(
                                        parent.toString() + File.separator + url.replaceAll("#.*", ""), title);

                                uriList.add(documentPointer);
                                //                                System.out.println("nav point: " + label + " src " + parent.toString()
                                //                                        + System.lineSeparator() + url.replaceAll("#.*",""));

                            } else {
                                // System.out.println("nav point: " + label + " src " + child.getElementsByTag("content").attr("src"));
                            }

                        }
                    }
                }

            }
        } catch (Exception e) {
            e.printStackTrace();
        }

        //        System.out.println("Size: " + uriList.size());

        //        for (DocumentPointer pointer : uriList)
        //        {
        //            //parse and
        //            System.out.println(pointer.getSourse() + "\t" + pointer.getUri());
        //        }
    }

    public Properties loadProperties() {
        Properties properties = new Properties();

        try (InputStream propertiesFileInputStream = IndexHandler.class.getClassLoader()
                .getResourceAsStream("properties.xml")) {

            if (propertiesFileInputStream != null) {
                properties.loadFromXML(propertiesFileInputStream);

                propertiesFileInputStream.close();

            } else {

                System.out.print("Property file not available");

            }

        } catch (Exception e) {
            e.printStackTrace();

        }
        return properties;
    }

    public void getURIForAllDiaries(List<DocumentPointer> documentPointers, Path pathToLetters) {
        List<Path> results = new ArrayList<>();

        int maxDepth = 6;

        try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> {
            return String.valueOf(path).endsWith(".ncx");
        })) {

            stream.forEach(results::add);

        } catch (IOException e) {
            e.printStackTrace();
        }

        System.out.println("files: " + results.size());

        try {

            for (Path res : results) {
                Path parent = res.getParent();

                //                System.out.println("---------------------------------------------");
                //                System.out.println(parent.toString());
                //use jsoup to list all files that contain something useful
                Document doc = Jsoup.parse(res.toFile(), "UTF-8");

                String title = "";

                for (Element element : doc.getElementsByTag("docTitle")) {
                    //Letter letter = new Letter();

                    // StringBuilder content = new StringBuilder();

                    for (Element child : element.children()) {
                        title = child.text();
                        // System.out.println("Title: " + title);
                    }
                }

                //  System.out.println("==========================   " + res.toString() + " ==========================");

                boolean startPrinting = false;

                boolean newFile = true;

                for (Element element : doc.getElementsByTag("navPoint")) {

                    //get nav label and content

                    Element navLabelElement = element.select("navLabel").first();
                    Element srsElement = element.select("content").first();

                    String navLabel = "";
                    String srs = "";

                    if (navLabelElement != null) {
                        navLabel = navLabelElement.text().replaceAll("\\*", "").trim();
                    }

                    if (srsElement != null) {
                        srs = srsElement.attr("src");
                    }

                    if ("??".matches(navLabel))

                    {
                        startPrinting = false;

                        // System.out.println("----------------- end of file pointer ---------------");
                    }

                    if (StringUtils.isNotEmpty(navLabel)
                            && navLabel.matches("??.*|?? ?.*") && newFile) {
                        newFile = false;
                        startPrinting = true;
                    }

                    if (startPrinting && !navLabel
                            .matches("(|??? ??)")) {
                        // System.out.println("----------------- file pointer ---------------");
                        //   System.out.println(navLabel + "\t" + srs);

                        DocumentPointer documentPointer = new DocumentPointer(
                                parent.toString() + File.separator + srs.replaceAll("#.*", ""), title);

                        documentPointers.add(documentPointer);
                    }

                }

                //   System.out.println("==========================   END OF FILE ==========================");

            }
        } catch (Exception e) {
            e.printStackTrace();
        }

        System.out.println("Size: " + documentPointers.size());

        //  for (DocumentPointer pointer : documentPointers)
        // {
        //parse and
        //     System.out.println(pointer.getSourse() + "\t" + pointer.getUri());
    }

}