Java tutorial
/* Copyright (c) 2011 Danish Maritime Authority * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 3 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this library. If not, see <http://www.gnu.org/licenses/>. */ package dk.dma.msinm.legacy.nm; import dk.dma.msinm.common.util.TextUtils; import dk.dma.msinm.model.*; import org.apache.commons.lang.StringUtils; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.util.PDFTextStripper; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.*; import java.text.NumberFormat; import java.text.ParseException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Massive class for extracting NM messages from PDF files. * <p> * The format of the PDF files is that of: * http://www.soefartsstyrelsen.dk/AdvarslerEfterretninger/EfterretningerForSoefarende/Sider/Default.aspx * </p> */ public class NmPdfExtractor { public static final String PDF_NAME_FORMAT = "^(\\d+) EfS (\\d+).pdf$"; enum Line { PREVIOUS(10, "Tidligere EfS.", "Former EfS."), REFERENCE(20, "EfS-henvisning.", "EfS reference."), TIME(30, "Tid.", "Time."), POSITION(40, "Position.", "Position."), DETAILS(50, "Detaljer.", "Details."), NOTE(60, "Note.", "Note."), CHARTS(70, "Skort.", "Chart(s)."), PUBLICATION( 80, "Publikation.", "Publication(s)."), SOURCE(90, "(", "("); int index; String nameLocal, nameEnglish; private Line(int index, String nameLocal, String nameEnglish) { this.index = index; this.nameLocal = nameLocal; this.nameEnglish = nameEnglish; } public String getName(String lang) { return "da".equals(lang) ? nameLocal : nameEnglish; } } static class LinePart<T> { T part; String remainingLine; public LinePart(T part, String remainingLne) { this.part = part; this.remainingLine = remainingLne; } } Logger log = LoggerFactory.getLogger(NmPdfExtractor.class); String organization; InputStream inputStream; String fileName; int year, week; /** * Constructor * * @param file the PDF file */ public NmPdfExtractor(File file, String organization) throws FileNotFoundException { this(new FileInputStream(file), file.getName(), organization); } /** * Constructor * * @param inputStream the PDF input stream * @param fileName the name of the PDF file */ public NmPdfExtractor(InputStream inputStream, String fileName, String organization) { this.inputStream = inputStream; this.fileName = fileName; this.organization = organization; Matcher m = getFileNameMatcher(fileName); if (!m.matches()) { throw new IllegalArgumentException( "Invalid file name, " + fileName + ". Must have format 'yyyy EfS ww.pdf'"); } this.year = Integer.valueOf(m.group(1)); this.week = Integer.valueOf(m.group(2)); } /** * Returns a matcher for the file name * @param fileName the file name * @return the matcher */ public static Matcher getFileNameMatcher(String fileName) { Pattern p = Pattern.compile(PDF_NAME_FORMAT); return p.matcher(fileName); } public int getYear() { return year; } public int getWeek() { return week; } /** * Main method for extracting the NtM's * @param notices the list of notices to update */ public void extractNotices(List<Message> notices) throws Exception { PDDocument document = null; try { PDFTextStripper stripper = new PDFTextStripper(); document = PDDocument.load(inputStream); stripper.setStartPage(3); String text = stripper.getText(document); List<String> textBlocks = extractNoticeTextBlocks(text); extractNotices(notices, textBlocks); } catch (IOException e) { log.error("Error extracting notivces from file " + fileName, e); throw e; } finally { if (document != null) { document.close(); } try { inputStream.close(); } catch (Exception ex) { } } } /** * Chops the text into blocks of text, each representing an NtM * @param text the full text * @return the list of NtM texts */ private List<String> extractNoticeTextBlocks(String text) throws IOException { List<String> result = new ArrayList<>(); BufferedReader br = new BufferedReader(new StringReader(text)); String line; while ((line = br.readLine()) != null) { line = line.trim(); // A new block starts with the message number or "*" boolean newBlock = line.matches("^[\\d]+\\..*$"); if (!newBlock && line.length() > 0 && (int) line.charAt(0) == 61611) { newBlock = true; line = "*"; } // A translation starts with "Translation" boolean translation = line.matches("Translation"); if (newBlock || translation) { StringBuilder block = new StringBuilder(); block.append(line).append(System.lineSeparator()); while ((line = br.readLine()) != null) { // Strip header and footer, incl. blank lines if (!line.matches("^\\s+$") && !line.startsWith("Efterretninger for Sfarende, uge") && !line.startsWith("Carl Jacobsens Vej 31")) { block.append(line).append(System.lineSeparator()); } // Last line of a block is "(source)" if (line.trim().matches("\\(.+\\)")) { result.add(block.toString()); break; } } } } return result; } /** * Convert the list of NtM texts into a list of NtM's * @param notices the list of NtM's to update * @param textBlocks the list of NtM text blocks */ private void extractNotices(List<Message> notices, List<String> textBlocks) throws IOException { Message notice = null; for (String text : textBlocks) { BufferedReader br = new BufferedReader(new StringReader(text)); String line = br.readLine(); if (line.matches("Translation")) { // English translation of previous Danish notice line = br.readLine(); readNotice(br, line, "en", notice); } else { // New Danish notice notice = new Message(); notices.add(notice); if (line.matches("^\\*")) { notice.setOriginalInformation(true); line = br.readLine(); } // Extract the number LinePart<String> parts = readFirstPart(line); SeriesIdentifier id = new SeriesIdentifier(); id.setMainType(SeriesIdType.NM); id.setYear(year); id.setAuthority(organization); id.setNumber(Integer.valueOf(parts.part)); notice.setSeriesIdentifier(id); line = parts.remainingLine; readNotice(br, line, "da", notice); } } } /** * Read the notice from the reader * @param br the reader * @param line the first line of the notice * @param lang the language * @param notice the notice to update */ private void readNotice(BufferedReader br, String line, String lang, Message notice) throws IOException { MessageDesc desc = notice.checkCreateDesc(lang); LinePart<Line> nextLine; do { nextLine = readLineType(br.readLine(), lang); if (nextLine.part != null) { break; } else { line = line + " " + nextLine.remainingLine; } } while (true); // First line -> type LinePart<String> parts = readFirstPart(line); if ("(P)".equalsIgnoreCase(parts.part)) { notice.setType(Type.PRELIMINARY_NOTICE); line = parts.remainingLine; } else if ("(T)".equalsIgnoreCase(parts.part)) { notice.setType(Type.TEMPORARY_NOTICE); line = parts.remainingLine; } else { notice.setType(Type.PERMANENT_NOTICE); } // First line -> locality and title line = removeLastPeriod(line); int titleIndex = line.lastIndexOf("."); if (titleIndex > 0) { readArea(line.substring(0, titleIndex).trim(), lang, notice); desc.setTitle(line.substring(titleIndex + 1).trim()); } else { readArea(line, lang, notice); desc.setTitle(""); } // Read the notice body Line type = nextLine.part; line = nextLine.remainingLine; do { nextLine = readLineType(br.readLine(), lang); if (nextLine.part != null) { readNoticeField(type, line, lang, notice, desc); type = nextLine.part; line = nextLine.remainingLine; if (type == Line.SOURCE) { readNoticeField(type, line, lang, notice, desc); break; } } else { line = line + "\n" + nextLine.remainingLine; } } while (true); } /** * Parses the line of a given type and updates the notice accordingly * @param type the type of the line * @param line the rest of the line * @param lang the language * @param notice the notice to update * @param desc the language-specific notice descriptor */ private void readNoticeField(Line type, String line, String lang, Message notice, MessageDesc desc) { switch (type) { case CHARTS: if ("da".equals(lang)) { readCharts(removeLastPeriod(line), notice); } break; case DETAILS: desc.setDescription(TextUtils.txt2html(line)); break; case NOTE: desc.setNote(line); break; case REFERENCE: if ("da".equals(lang)) { Arrays.asList(removeLastPeriod(line).split(",|( and )|( og )")) .forEach(ref -> addReference(ref.trim(), ReferenceType.REFERENCE, notice)); } break; case POSITION: if ("da".equals(lang)) { readLocation(line, lang, notice); } break; case PREVIOUS: if ("da".equals(lang)) { if (line.contains("ajourfrt") || line.contains("ny tid")) { addReference(line, ReferenceType.UPDATE, notice); } else if (line.contains("gentagelse")) { addReference(line, ReferenceType.REPETITION, notice); } else if (line.contains("udgr")) { addReference(line, ReferenceType.CANCELLATION, notice); } } break; case PUBLICATION: desc.setPublication(removeLastPeriod(line)); break; case SOURCE: desc.setSource(line.substring(0, line.length() - 1)); break; case TIME: desc.setTime(line); break; } } /** * Adds a reference of the given type to the notice. * The reference has the format "18/460 2014" * @param ref the reference * @param type the type * @param notice the notice to update */ void addReference(String ref, ReferenceType type, Message notice) { Matcher m = Pattern.compile("[-\\d]+/(\\d+) (\\d+).*").matcher(ref); if (m.matches()) { SeriesIdentifier id = new SeriesIdentifier(); id.setMainType(SeriesIdType.NM); id.setAuthority(organization); id.setNumber(Integer.valueOf(m.group(1))); id.setYear(Integer.valueOf(m.group(2))); Reference reference = new Reference(); reference.setMessage(notice); reference.setType(type); reference.setSeriesIdentifier(id); notice.getReferences().add(reference); } } /** * Reads and updates the area parent hierarchy from the dot-separated list of areas in the line * @param line the line containing the areas * @param lang the language * @param notice the notice to update */ void readArea(String line, String lang, Message notice) { String[] areaNames = line.split("\\."); Area area = null, parent = null; if ("da".equals(lang)) { // Create the areas for (String name : areaNames) { area = new Area(); area.createDesc(lang).setName(name.trim()); area.setParent(parent); parent = area; } notice.setArea(area); } else { // Update the Danish names area = notice.getArea(); for (int x = areaNames.length - 1; x >= 0 && area != null; x--) { area.checkCreateDesc("en").setName(areaNames[x].trim()); area = area.getParent(); } } } /** * Reads the line representing a list of charts and updates the notice accordingly * @param line the line * @param notice the notice to update */ public void readCharts(String line, Message notice) { Pattern p1 = Pattern.compile("(\\d+)"); Pattern p2 = Pattern.compile("(\\d+) \\(INT (\\d+)\\)"); for (String chart : line.split(",")) { Matcher m1 = p1.matcher(chart.trim()); Matcher m2 = p2.matcher(chart.trim()); if (m1.matches()) { notice.getCharts().add(new Chart(m1.group(1), null)); } else if (m2.matches()) { notice.getCharts().add(new Chart(m2.group(1), Integer.parseInt(m2.group(2)))); } } } /** * Reads and updates the locations from the line * @param locLine the line containing the locations * @param lang the language * @param notice the notice to update */ void readLocation(String locLine, String lang, Message notice) { NumberFormat format = NumberFormat.getInstance(Locale.FRANCE); String posPattern = "(\\d+)I?\\s+(\\d+,?\\d+)J?\\s+(N|S)\\s+(\\d+)I?\\s+(\\d+,?\\d+)J?\\s+(E|W),?(.*)"; Pattern p1 = Pattern.compile(posPattern); Pattern p2 = Pattern.compile("(\\d+)\\)\\s+" + posPattern); Location location = new Location(); String[] lines = locLine.split("\n"); for (int x = 0; x < lines.length; x++) { String line = lines[x].trim(); if (line.endsWith(".")) { line = line.substring(0, line.length() - 1); } Matcher m1 = p1.matcher(line); Matcher m2 = p2.matcher(line); if (m1.matches() || m2.matches()) { Point pt = new Point(); pt.setLocation(location); try { int i = 1; Matcher m; if (m1.matches()) { m = m1; pt.setIndex(x + 1); } else { m = m2; pt.setIndex(Integer.valueOf(m.group(i++))); } pt.setLat(parsePos(Integer.parseInt(m.group(i++)), format.parse(m.group(i++)).doubleValue(), m.group(i++))); pt.setLon(parsePos(Integer.parseInt(m.group(i++)), format.parse(m.group(i++)).doubleValue(), m.group(i++))); String desc = m.group(i).trim(); if (StringUtils.isNotBlank(desc)) { pt.checkCreateDesc(lang).setDescription(desc); } } catch (ParseException e) { e.printStackTrace(); } location.getPoints().add(pt); } else { log.warn("No match " + lines[x]); } } if (location.getPoints().size() > 0) { location.setType(location.getPoints().size() == 1 ? Location.LocationType.POINT : (location.getPoints().size() == 2 ? Location.LocationType.POLYLINE : Location.LocationType.POLYGON)); notice.getLocations().add(location); } } public static double parsePos(int h, double m, String pos) { return h + m / 60.0 * (pos.equalsIgnoreCase("S") || pos.equalsIgnoreCase("W") ? -1 : 1); } /** * Looks at the prefix of the list to determine the type * @param line the line to parse * @param lang the language * @return the tuple of the type and remaining part of the line */ private LinePart<Line> readLineType(String line, String lang) { for (Line lineType : Line.values()) { String prefix = lineType.getName(lang); if (line.trim().startsWith(prefix)) { return new LinePart<>(lineType, line.substring(prefix.length()).trim()); } } return new LinePart<>(null, line); } /** * Splits the line into the part before and after the first period * @param line the line to split * @return the two parts of the line before and after the first period */ private LinePart<String> readFirstPart(String line) { int i = line.indexOf("."); if (i >= 0) { return new LinePart<>(line.substring(0, i).trim(), i == line.length() - 1 ? "" : line.substring(i + 1).trim()); } return new LinePart<>(line, ""); } /** * Removes any trailing period from the line * @param line the line * @return the line excluding any trailing period */ private String removeLastPeriod(String line) { line = line.trim(); if (line.endsWith(".")) { line = line.substring(0, line.length() - 1); } return line; } }