us.colloquy.util.DiaryParser.java Source code

Java tutorial

Introduction

Here is the source code for us.colloquy.util.DiaryParser.java

Source

    /*
     * Copyright (c) 2016. Tatyana Gershkovich
     *
     * Licensed under the Apache License, Version 2.0 (the "License");
     * you may not use this file except in compliance with the License.
     * You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    package us.colloquy.util;

    import org.jsoup.nodes.TextNode;
    import org.jsoup.select.Elements;
    import us.colloquy.model.DiaryEntry;
    import us.colloquy.model.DocumentPointer;
    import org.apache.commons.lang.StringUtils;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.junit.Test;

    import java.io.File;
    import java.io.IOException;
    import java.io.PrintWriter;
    import java.time.LocalDate;
    import java.time.ZoneId;
    import java.util.ArrayList;
    import java.util.Date;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

import static us.colloquy.util.CommonTextParser.ldCyrillicFilter;

    /**
     * Created by Peter Gershkovich on 12/27/15.
     */
    public class DiaryParser {

        ///Documents/Tolstoy/openDiaries/dnevnik_1893(2)/OEBPS/Text/0001_1006_2001.xhtml

        /**
         * Locates a four-digit run (capture group 1) surrounded by at most 20 arbitrary
         * characters on either side. Used with {@code Matcher.find()} to pick a year out
         * of a short heading. The leading context is greedy, so when several four-digit
         * runs sit close together the one captured is the latest that still fits.
         * Declared {@code final}: the pattern is compiled once and never reassigned.
         */
        private static final Pattern yearPattern = Pattern.compile(".{0,20}(\\d{4}).{0,20}");

        /**
         * Exploratory run against one hard-coded XHTML file under the user's home
         * directory: walks every element of class "section", starts a new
         * {@link DiaryEntry} whenever a child yields a date, accumulates the text of
         * the following children into it, and finally prints every collected entry.
         */
        @Test
        public void useJsoup() {
            final String sourceLabel = "pointer";

            File xhtml = new File(System.getProperty("user.home")
                    + "/Documents/Tolstoy/90-volume-set/diaries/uzip/dnevnik_1881-1887_vol_49/OEBPS/Text/0001_1011_2005.xhtml");

            List<DiaryEntry> collected = new ArrayList<>();
            String lastYear = "";

            try {
                Document page = Jsoup.parse(xhtml, "UTF-8");

                for (Element section : page.getElementsByClass("section")) {
                    DiaryEntry current = null;
                    StringBuilder body = new StringBuilder();

                    for (Element paragraph : section.children()) {
                        // Every child is treated as a continuation of the current entry
                        // unless it turns out to carry a date, which opens a new entry.
                        DiaryEntry probe = new DiaryEntry();
                        Element em = paragraph.select("em").first();

                        if (em != null) {
                            // Paragraph contains an <em>: try to read date and place from it.
                            lastYear = parseDateAndPlace(lastYear, probe, paragraph);
                        } else if (StringUtils.isNotEmpty(paragraph.text())) {
                            // No <em>: only try date parsing when the text contains a four-digit run.
                            Matcher yearMatch = yearPattern.matcher(paragraph.text());
                            if (yearMatch.find()) {
                                // The paragraph text is overwritten with just the digits before parsing.
                                paragraph.text(yearMatch.group(1));
                                lastYear = parseDateAndPlace(lastYear, probe, paragraph);
                            }
                        }

                        if (probe.getDate() != null) {
                            System.out.println("Found date: " + probe.getDate());

                            // Close the entry collected so far before opening the next one.
                            if (current != null) {
                                current.setEntry(body.toString());
                                collected.add(current);
                            }

                            current = new DiaryEntry();
                            current.setSource(sourceLabel);
                            current.setDate(probe.getDate());
                            current.setPlace(probe.getPlace());
                            body = new StringBuilder();
                        }

                        // Read the text only now: it may have been replaced above.
                        String text = paragraph.text();
                        if (StringUtils.isNotEmpty(text) && text.length() > 8) {
                            body.append(text).append("\n");
                        }
                    }

                    // Flush whatever is still pending at the end of the section.
                    if (current != null && body.length() > 0) {
                        current.setEntry(body.toString());
                        collected.add(current);
                    }
                }
            } catch (IOException e) {
                e.printStackTrace();
            }

            for (DiaryEntry entry : collected) {
                System.out.println(entry.toString());
            }
        }

    /**
     * This is the heart of the parsing process: reads one XHTML document with Jsoup,
     * splits each element of class "section" into dated diary entries, and appends
     * the accepted entries to {@code diaryEntries}.
     *
     * <p>A new entry starts whenever a child element yields a date through
     * {@code parseDateAndPlace}; the text of that child and of the children that
     * follow is accumulated as the entry body. An entry is kept only if its trimmed
     * body is longer than 35 characters and it has a date.
     *
     * <p>An {@link IOException} from reading the file is printed with
     * {@code printStackTrace()} and not rethrown; the two summary lines at the end
     * are printed regardless.
     *
     * @param documentPointer supplies the file location ({@code getUri()}) and the
     *                        source label ({@code getSourse()}) stored on each entry
     * @param pointerCounter  number used as the prefix of the per-paragraph id passed
     *                        to {@code parseDateOnly}
     * @param diaryEntries    output list; accepted entries are appended to it
     * @param outDebug        passed through to {@code parseDateOnly}
     */
    public static void parseDiaries(DocumentPointer documentPointer, int pointerCounter, List<DiaryEntry> diaryEntries, PrintWriter outDebug)
    {
        File input = new File(documentPointer.getUri());

        // Last year produced by parseDateAndPlace; fed back in for later paragraphs.
        String previousYear = "";

        // Last value returned by parseDateOnly; fed back in for later paragraphs.
        Date previousDate = null;

        try
        {
            Document doc = Jsoup.parse(input, "UTF-8");

            Elements elements = doc.getElementsByTag("title");


            //exclude irrelevant sections
            if (elements != null)
            {
                if (elements.size() > 1)
                {// no effect at runtime; the unused local looks like a debugger breakpoint anchor
                   String stop =" x";
                }

                for (Element element : elements)
                {
                    String title =  element.text();
                    //looks like one file has only one title!
                    // NOTE(review): these literals appear to have lost their non-ASCII characters
                    // (only "?" and an empty string remain). As written, title.contains("") is true
                    // for every string, so any document with a <title> element returns here.
                    // Restore the literals from the original file before relying on this filter.
                    if (title.contains("") || title.contains("???") || title.contains("??") ||
                            title.contains("??  ?? ? "))
                    {
                        return;   //no need for these documents
                    }

                   // System.out.println(documentPointer.getUri() + "\t" + element.text());
                }
            }


            //process the rest

            // Counts paragraphs that start with <em>; combined with pointerCounter to form an id.
            int elementCounter = 0;

            for (Element element : doc.getElementsByClass("section"))   //now for each section in a document
            {
                // Entry currently being accumulated; null until the first dated child of the section.
                DiaryEntry diaryEntry = null;

                StringBuilder contentBuilder = new StringBuilder();
                              
                for (Element child : element.children())
                {
                    replaceSupTag(child);  //replace all note numbers written as superscript

                    // Scratch entry: receives the date/place of this child, if any is found.
                    DiaryEntry diaryEntryToCollectDate = new DiaryEntry();

                    //we send it in two cases: when the text matches a year or when the text has an em element
                    Element em = child.select("em").first();

                    if (em == null && StringUtils.isNotEmpty(child.text()))  //this occurs in section headings and helps determine the year, which the diary text itself often leaves implied

                    {
                        Matcher m = yearPattern.matcher(child.text());

                        if (m.find())
                        {
                            //sanity check: only years 1841..1910 (bounds exclusive) are accepted
                            int year = Integer.valueOf(m.group(1));
                            {
                                if (year > 1840 && year < 1911)
                                {
                                    // Overwrites the child's text with just the year before parsing,
                                    // so the text appended to the entry body below is the bare year.
                                    child.text(year + "");

                                    String prevYearTmp = parseDateAndPlace(previousYear, diaryEntryToCollectDate, child);

                                    if (StringUtils.isNotEmpty(prevYearTmp))
                                    {
                                        previousYear = prevYearTmp;

                                    }
                                }
                            }


                        }
                    }

                    // Only paragraphs whose HTML begins with <em> are treated as date lines.
                    if (em != null && child.html().startsWith("<em>"))
                    {
                        //get first em tag of a paragraph

                        // NOTE(review): the last two replaceAll patterns appear to have lost non-ASCII
                        // characters. "(?iu)??" is probably not a valid regex as written (a quantifier
                        // with nothing to repeat) and "(?iu)" matches only the empty string.
                        // Restore both from the original file.
                       String fistEmTag =  ldCyrillicFilter(em.text()).replaceAll("(\\[|\\])","").replaceAll("\\."," ").replaceAll("(?iu)??","").replaceAll("(?iu)","").trim();

//                        System.out.println((fistEmTag) + "\tFirst em tag" );

                        //use this first tag to figure out date

                        String emId = (pointerCounter + "-" + (elementCounter++));

                        // Note: the entry passed here is diaryEntry (the one being accumulated, null for
                        // the first dated paragraph of a section), not diaryEntryToCollectDate.
                        previousDate = parseDateOnly(previousDate, diaryEntry, fistEmTag, emId, outDebug);

                        //logic is to make sure it is close to the previous date unless it is pretty clear.
                        //let's find out if the gap between this date and previous date is more than a month and if a date is earlier than the one found before.


                        String prevYearTmp = parseDateAndPlace(previousYear, diaryEntryToCollectDate, child);

                        if (StringUtils.isNotEmpty(prevYearTmp))
                        {
                            previousYear = prevYearTmp;
                        }
                    }

                    if (diaryEntryToCollectDate.getDate() != null) //this is the beginning of a new entry
                    {
                        // System.out.println("Found date: " + diaryEntryToCollectDate.getDate());
                        //close the entry accumulated so far, then create a new DiaryEntry
                        if (diaryEntry != null)
                        {
                            String content = contentBuilder.toString().trim();

                            // Keep the entry only if its body is longer than 35 characters and it has a date.
                            if (StringUtils.isNotEmpty(content) && content.length() > 35
                                    && diaryEntry.getDate() != null)
                            {
//                                if (content.matches("(?smi).*1847.*"))
//                                {
//                                    String stop = "";
//                                }

                                diaryEntry.setEntry(ldCyrillicFilter(content));
                                diaryEntries.add(diaryEntry);                     //add consecutive entries here

                            }  else if (content.length() > 0 && content.length() > 35)
                            {
                                // Reached only for a long body whose entry has no date; shorter bodies are dropped silently.
                                System.out.println("removed entries: " + content);
                            }
                            
                        }

                        diaryEntry = new DiaryEntry();
                        diaryEntry.setSource(ldCyrillicFilter(documentPointer.getSourse()));
                        diaryEntry.setDate(diaryEntryToCollectDate.getDate());
                        diaryEntry.setPlace(ldCyrillicFilter(diaryEntryToCollectDate.getPlace()));

                        contentBuilder = new StringBuilder();

                    }

                    // Append this child's (possibly rewritten) text to the current body, one line per child.
                    if (StringUtils.isNotEmpty(child.text()) && child.text().trim().length() > 0)
                    {
                        contentBuilder.append(ldCyrillicFilter(child.text()) + "\n");

                    }
//
//                    System.out.println(child.tag() + "\n");
//                    System.out.println(child.outerHtml() + "\n" + child.text());
                }

                //whatever we still have at the end of the section, add here:
                String content = contentBuilder.toString().trim();

                if (StringUtils.isNotEmpty(content) && content.length() > 35  && diaryEntry != null
                        && diaryEntry.getDate() != null)
                {

                    // Unlike the mid-section flush, content is stored without a second ldCyrillicFilter
                    // pass; each paragraph was already filtered when it was appended.
                    diaryEntry.setEntry(content);
                    diaryEntries.add(diaryEntry);
                }

            }

        } catch (IOException e)
        {
            e.printStackTrace();
        }

        System.out.println("------------------------------   " + documentPointer.getUri() + "   ----------------------------");

        //  System.out.println(documentPointer.toString() + " Letters: " + diaryEntries.size() + " Rejected letters: " + rejectedEntries.size());

//        for (DiaryEntry diaryEntry : diaryEntries)
//        {
//            System.out.println(diaryEntry.toString());
//        }

        // diaryEntries is shared across calls, so this size is a running total, not a per-file count.
        System.out.println("------------------------------  total documents cumulative " + diaryEntries.size() + "   ----------------------------");


    }

        /**
         * Swaps every {@code sup} element found by {@code child.getElementsByTag("sup")}
         * for a plain text node holding the element's text wrapped in square brackets,
         * so superscript note numbers end up as {@code [n]} in the extracted text.
         *
         * @param child element whose {@code sup} elements are rewritten in place
         */
        private static void replaceSupTag(Element child) {
            for (Element sup : child.getElementsByTag("sup")) {
                TextNode bracketed = new TextNode("[" + sup.text() + "]", null);
                sup.replaceWith(bracketed);
            }
        }

    /**
     * Tries to read a date and place for {@code diaryEntry} from {@code child} and
     * reports which year to carry forward.
     *
     * <p>The text handed to {@code RussianDate.parseDateAndPlace} is the text of the
     * first {@code em} element when that is non-empty, otherwise the text of
     * {@code child} itself. If both are empty nothing is parsed.
     *
     * @param previousYear year carried over from earlier paragraphs; passed to the parser
     * @param diaryEntry   entry that the parser populates
     * @param child        element to read the date/place text from
     * @return the year of {@code diaryEntry.getDate()} (system default time zone) as a
     *         string when a date is set after parsing; otherwise {@code previousYear} unchanged
     */
    private static String parseDateAndPlace(String previousYear, DiaryEntry diaryEntry, Element child) {
        // Only the first <em> is of interest.
        Elements emTags = child.getElementsByTag("em");
        Element firstEm = (emTags == null || emTags.isEmpty()) ? null : emTags.get(0);

        String rawText = (firstEm == null) ? null : firstEm.text();
        if (StringUtils.isEmpty(rawText)) {
            // No usable <em> text: fall back to the whole element's text.
            rawText = child.text();
        }
        if (StringUtils.isEmpty(rawText)) {
            return previousYear;
        }

        String datePlaceText = ldCyrillicFilter(rawText).replaceAll("\\*\\[\\]", "");
        RussianDate.parseDateAndPlace(diaryEntry, datePlaceText, previousYear);

        if (diaryEntry.getDate() == null) {
            return previousYear;
        }

        int parsedYear = diaryEntry.getDate().toInstant().atZone(ZoneId.systemDefault()).toLocalDate().getYear();
        return String.valueOf(parsedYear);
    }

        /**
         * Thin pass-through to {@code RussianDate.parseDate}; all arguments are handed
         * over unchanged and its result is returned as is.
         *
         * @param previousDate date returned for the preceding paragraph, or {@code null}
         * @param diaryEntry   entry handed to the parser (callers may pass {@code null})
         * @param text         cleaned text of the paragraph's first {@code em} element
         * @param entryId      identifier of the paragraph being parsed
         * @param outDebug     writer handed to the parser
         * @return whatever {@code RussianDate.parseDate} returns
         */
        private static Date parseDateOnly(Date previousDate, DiaryEntry diaryEntry, String text, String entryId,
                PrintWriter outDebug) {
            Date resolved = RussianDate.parseDate(previousDate, diaryEntry, text, entryId, outDebug);
            return resolved;
        }

    /**
     * Smoke test: runs {@code ldCyrillicFilter} once on a fixed sample string.
     * The result is discarded, so this only checks that the call completes.
     */
    @Test
    public void testOldCyrillicFilter()
    {
        String sample = "  ??   ? ? ?     ?  ?? ?, ?  ?  ? ? ? ? ? ?- ???. ?   ? ?    ?  ?, ??,      ??    ? ???  ? .";

        ldCyrillicFilter(sample);
    }
    }