GIST.IzbirkomExtractor.TableExtractor.java Source code

Introduction

Here is the source code for GIST.IzbirkomExtractor.TableExtractor.java
Source

    /**
     * Copyright  2013 , UT-Battelle, LLC
     * All rights reserved
     *                                        
     * JavaParserOfAddresses, Version 1.0
     * http://github.com/sorokine/JavaParserOfAddresses
     * 
     * This program is freely distributed under UT-Batelle, LLC
     * open source license.  Read the file LICENSE.txt for details.
     */

    package GIST.IzbirkomExtractor;

    import java.io.File;
    import java.io.IOException;
    import java.sql.SQLException;
    import java.text.Normalizer;
    import java.util.ArrayList;
    import java.util.logging.Logger;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    import org.apache.commons.lang3.StringUtils;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;

    /**
     * @author Alex Sorokine <sorokine@gmail.com>
     *
     */
    public class TableExtractor {

        /**
         * Main logger
         */
        private final static Logger logger = Logger.getLogger("ALL");

        private AddressParserManager apm;
        private AddressMatcher addressMatcher;
        private ResultSink resultSink;

        public TableExtractor(AddressMatcher addressMatcher, ResultSink resultSink)
                throws InstantiationException, IllegalAccessException {
            this.addressMatcher = addressMatcher;
            this.resultSink = resultSink;
            this.apm = new AddressParserManager();
            apm.loadAllParsers();
        }

        public void processHTMLfile(File input_html) throws IOException, TableExtractorException,
                CloneNotSupportedException, SQLException, ResultSinkException {

            logger.info("Start processing " + input_html);

            Document doc = Jsoup.parse(input_html, "UTF-8");
            Elements tables = doc.getElementsByTag("table");

            /* count of parseable tables found */
            int tables_found = 0;

            /* determine raion name */
            String raion_name = extractRaionFromFileName(input_html.getName());
            //System.err.println(raion_name);

            // TODO: inflect raion name in  case

            /* searches for a table that has " . -" in its very 1st cell */
            for (Element table : tables) {
                Elements rows = table.getElementsByTag("tr");
                boolean firstRow = true;

                row_loop: for (Element row : rows) {
                    Elements cells = row.getElementsByTag("td");

                    if (firstRow) {
                        //System.err.println(row.text());
                        if (isParsableTable(row)) {
                            firstRow = false;
                            logger.info("Processing table #" + ++tables_found + " in " + input_html);
                        } else
                            break row_loop;
                    }

                    if (StringUtils.getLevenshteinDistance(cleanupUNICODE(cells.first().text()),
                            " . -") < 3)
                        continue row_loop; /* skip the row if it looks like a table header */

                    /* skip rows with all cells empty */
                    boolean emptyRow = true;
                    for (Element cell : cells)
                        emptyRow = emptyRow && cleanupUNICODE(cell.text()).isEmpty();
                    if (emptyRow)
                        continue;

                    int i_cell = 0;
                    Element station_id = null;
                    Element address_field = null;
                    Element org_address = null; /* address of the ??? */
                    Element station_address = null;

                    for (Element cell : cells) {
                        switch (i_cell) {
                        case 0:
                            station_id = cell;
                            break;
                        case 1:
                            address_field = cell;
                            break;
                        case 2:
                            org_address = cell;
                            break;
                        case 3:
                            station_address = cell;
                        default:
                            break;
                        }
                        i_cell++;
                    }

                    if (station_id == null)
                        throw new TableExtractorException("Polling station ID not found", row, input_html);
                    if (address_field == null)
                        throw new TableExtractorException("Address list not found", row, input_html);

                    /* extract int from poll station id */
                    int psid;
                    try {
                        psid = Integer.valueOf(cleanupUNICODE(station_id.text()).trim().replaceAll("[^\\d]", ""));
                    } catch (NumberFormatException e) {
                        Exception te = new TableExtractorException("Failed to parse polling station ID >"
                                + cleanupUNICODE(station_id.text()).trim() + "<: ", station_id, input_html);
                        logger.severe(te.getMessage() + "; rest of " + input_html + " ignored.");
                        return;
                    }

                    /* extraction from HTML completely finished, now we work only with the addresses in the text form */
                    extractAddressesFromText(raion_name.trim(), psid, cleanLeftoverHTML(address_field),
                            cleanLeftoverHTML(org_address), cleanLeftoverHTML(station_address));
                }
            }

            if (tables_found == 0)
                logger.severe("No parsable tables found in " + input_html);
            resultSink.commit();

            logger.info("" + tables_found + " table(s) processed in " + input_html);
        }

        /**
         * Cleaning up leftover of HTML code from the cell content.
         * 
         * @param cell_content HTML code contains in the table cell 
         * @return an array list containing each line of the cell_content withh all HTML markup removed
         */
        private ArrayList<String> cleanLeftoverHTML(Element cell_content) {

            ArrayList<String> streets_and_numbers = new ArrayList<String>();

            /* <div>s designate separate lines inside the table cell */
            for (Element addr_line : cell_content.getElementsByTag("div")) {

                /* skip empty address lines */
                String addr_line_text = cleanupUNICODE(addr_line.text());
                if (StringUtils.isBlank(addr_line_text))
                    continue;

                /* <strong> is not particularly useful, but can designate placement of simple separators like space */
                Elements streets = addr_line.getElementsByTag("strong");
                if (!streets.isEmpty()) {
                    addr_line_text = addr_line_text.replaceFirst(Pattern.quote(streets.text()),
                            " " + streets.text() + " ");
                }

                streets_and_numbers.add(addr_line_text);
            }
            return streets_and_numbers;
        }

   /**
    * Extracts raion name for 'like' query on name or okato columns in the database.
    * @param fileName file name in the form "?   ? ?? .html"
    * @return
    */
   private static String extractRaionFromFileName(String fileName) {
      /**
       * You probably wondering what NFC is and what the next line is doing.
       * It is only relevant to OSX file names.  For details see
       * http://stackoverflow.com/questions/3610013/file-listfiles-mangles-unicode-names-with-jdk-6-unicode-normalization-issues
       */
      String fileNameNFC = Normalizer.isNormalized(fileName, Normalizer.Form.NFC) ? 
            fileName : Normalizer.normalize(fileName, Normalizer.Form.NFC);
      return 
            fileNameNFC.replaceFirst("\\.html?$", ""). /* remove parts of the file name */
            replaceFirst("^?  .* ?\\S+\\s+", "").
//            replaceAll("\\s*\\b\\b\\s*", ""). /* THIS IS NFD FROM remove word "" and spaces*/
            replaceAll("\\s*\\b\\b\\s*", ""). /* THIS IS NFC FORM remove word "" and spaces*/
            replace('', '').replace('?', ''). /* replace ? */
            replaceAll("(||||)$", ""); /* remove ? , , ,  */
   }

        /**
         * For tests, ignore.
         * @param argv
         */
        public static void main(String[] argv) {
            System.out.println(extractRaionFromFileName(
                    "?   ?  .html"));
        }

   /**
    * Address tokenizer 
    * @param psid polling station ID
    * @param raion_name  
    *         
    * @param streets_and_numbers
    * @param station_address 
    * @param org_address 
    * @throws CloneNotSupportedException
    * @throws IOException 
    * @throws SQLException 
    * @throws ResultSinkException 
    */
   private void extractAddressesFromText(
         String raion_name, 
         int psid, 
         ArrayList<String> streets_and_numbers, 
         ArrayList<String> org_address, 
         ArrayList<String> station_address
      ) throws CloneNotSupportedException, IOException, ResultSinkException, SQLException {
      
      /* split the address line on ; in case there is a street name somewhere in the middle; works for most cases */
      ArrayList<String> streets_and_numbers_split = new ArrayList<String>();
      for (String addr_line_text_raw : streets_and_numbers) {
         
         // FIXME: move cleanup to later parsing stages
         String addr_line_text = cleanupUNICODE(addr_line_text_raw).
               replace('.', ' '). // get rid of all periods 
               replaceAll("[:;]\\s*$", ""). // remove spaces and semicolons at the end of the string, colon is a typo fix
               replaceAll("\\s* ?\\s*", " ").
               replaceAll("\\s*\\s ", " ").
               replaceAll("\\s*\\s*", " ").
               replaceAll("\\s*? ?\\s*", " ").
               replace('', '').replace('?', ''). // an ugly way to handle 
               replaceAll("??", "??").
               replaceAll("\\s*(?|)\\s", "; $1 "). // fixes some missed spaces
               replace("?", "?").
               replace("()", "").
               replaceAll("??", "??").
               replaceAll("\\b?\\b", "??").
               replaceAll("???", "???").
               replaceAll("\\s+", " ").trim(); // remove spurious spaces (has to be the last action) 
         
         logger.fine("Levenshtein=" + StringUtils.getLevenshteinDistance(addr_line_text, addr_line_text_raw) + 
               ": " + addr_line_text_raw + "' => '" + addr_line_text +   "'");
         
         String addr_parts[] = addr_line_text.split("\\s*;\\s*");
         if (addr_parts.length > 0) {
            for (String s : addr_parts) {
               if (s.isEmpty()) continue;
               streets_and_numbers_split.add(s);
            }
         } else
            streets_and_numbers_split.add(addr_line_text);
      }
      
      /* fixes for incorrectly split lines */
      ArrayList<String> streets_and_numbers_fixed = new ArrayList<String>();
      streets_and_numbers_fixed.add(streets_and_numbers_split.get(0));

      /* these are the characters that can only be found in house number part */
      Pattern house_number_chars_pattern = Pattern.compile("[^0-9(),\\s/;??-\\-]");          
      
      for (int i = 1; i < streets_and_numbers_split.size(); i++) {
         String cur_line = streets_and_numbers_split.get(i);

         /* last element number of the array of fixed addresses */
         int last_el = streets_and_numbers_fixed.size() - 1;
         String prev_line = streets_and_numbers_fixed.get(last_el);
         
         /* ? ? and addresses in ?  can be broken across lines */
         if (prev_line.endsWith("?") && cur_line.startsWith("?") ||
               prev_line.endsWith("") && cur_line.matches("^\\d-??\\s*?.+$")) {
            
            streets_and_numbers_fixed.set(last_el, prev_line + " " + cur_line);
            continue;
         } 
         
         /* if a line looks like house numbers only, append it to the previous string */
         Matcher not_only_house_numbers = house_number_chars_pattern.matcher(cur_line);
         if (!not_only_house_numbers.find()) { // double negation  
            
            String infix = prev_line.matches(",\\s*$") || cur_line.matches("^\\s*,") ? "" : ", ";
            streets_and_numbers_fixed.set(last_el, prev_line + infix + cur_line);
            continue;
         } 
         
         streets_and_numbers_fixed.add(cur_line);
      }
      
      for (String street_name_and_numbers : streets_and_numbers_fixed) {
         
         ArrayList<ParsedAddress> pas = apm.parseThroughAllParsers(
               street_name_and_numbers, 
               StringUtils.join(org_address.toArray(), " "),
               StringUtils.join(station_address.toArray(), " "));
         if (pas.size() == 0) {
            logger.severe("Complete parse failure: '" + street_name_and_numbers + 
                  "' src='" + StringUtils.join(streets_and_numbers, '|') + 
                  "' from " + raion_name);
            continue;
         }
         //System.out.println(pa);
         
         /* matched individual addresses */
         ArrayList<Iterable<IndividualAddress>> matched_ias = new ArrayList<>(pas.size());
         int[] matches = new int[pas.size()]; // count of successful matches per ParsedAddress 
         int matched_parsers = 0; // number of parsers that had successful matches
         
         for (ParsedAddress pa : pas) {
            Iterable<IndividualAddress> ias = pa.enumerateIndividualAddresses();
            matched_ias.add(ias);
            
            if (ias != null)
               for (IndividualAddress individualAddress : ias) {
                  if (individualAddress == null) continue;
                  
                  individualAddress.setRaion(raion_name);
                  individualAddress.setPsid(psid);
                  individualAddress.setOtherParses(pas);
               
                  /* address matcher updates individual address */
                  addressMatcher.matchAddress(individualAddress);
                  logger.info("" + individualAddress);

                  if (individualAddress.getOsmid().size() > 0)
                     matches[matched_ias.size()-1]++;
               } 
            else {
               logger.severe("NULL IndividualAddress list in " + pa);
               return;
            }
            
            // increase count of parsers with successful matches
            if (matches[matched_ias.size()-1] > 0)
               matched_parsers++;
         }

         // set secondary match counts
         for (ParsedAddress pa : pas)
            pa.setParserMatchCount(matched_parsers);
         
         /* save result from the highest-priority parser */
         boolean resultPosted = false;
         for (int i = 0; i < matches.length; i++) {
            if (matches[i] > 0) 
               if (resultPosted)
                  for (IndividualAddress ia : matched_ias.get(i)) 
                     logger.info("Matches in multiple parses in " + ia);
               else {
                  for (IndividualAddress ia : matched_ias.get(i)) 
                     resultSink.postResult(ia);
                  resultPosted = true;
               }
         }
         
         /* if no addresses were matched post results from the highest priority parser */
         if (!resultPosted)
            for (IndividualAddress ia : matched_ias.get(0)) 
               resultSink.postResult(ia);
            
      }
   }

        /**
         * Tests the row if it looks like the 1st row of a parsable table
         * @param row
         * @return
         */
        private boolean isParsableTable(Element row) {

            Elements cells = row.getElementsByTag("td");

            /* number of columns should be 4 */
            if (cells.size() != 4)
                return false;

            /* look for number signs in 1st cell*/
            if (StringUtils.getLevenshteinDistance(cleanupUNICODE(cells.first().text()),
                    " . -") < 3)
                return true;

            /* discard the table if any of the cells is empty */
            for (Element cell : cells) {
                if (cleanupUNICODE(cell.text()).isEmpty())
                    return false;
            }

            /* 1st column should be a number */
            try {
                Integer.parseInt(cleanupUNICODE(cells.first().text()).trim());
                return true;
            } catch (NumberFormatException e) {
                return false;
            }
        }

        /**
         * Replaces strange UNICODE symbols with ASCII codes, trims leading and trailing spaces.  The symbols are:
         *  spaces
         *  dash
         * @param text
         * @return
         */
        private static String cleanupUNICODE(String text) {
            return text.replaceAll("[\\s\\xA0]+", " ").replaceAll("\\s*[\\-]\\s*", "-").trim();
        }

    }