// Java tutorial
/** * */ package com.iana.dver.pdf.scrapper; import java.awt.Rectangle; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.util.Collections; import java.util.List; import org.apache.commons.io.FilenameUtils; import org.apache.commons.io.comparator.LastModifiedFileComparator; import org.apache.log4j.Logger; import org.jdom2.Document; import org.jdom2.Element; import org.jdom2.output.Format; import org.jdom2.output.XMLOutputter; import org.pdfbox.pdmodel.PDDocument; import org.pdfbox.pdmodel.PDPage; import org.pdfbox.util.PDFTextStripperByArea; import org.springframework.integration.file.DirectoryScanner; import org.springframework.integration.file.HeadDirectoryScanner; import com.iana.dver.pdf.scrapper.exception.WrongConfigurationException; /** * @author tgbaxi */ public class DVERScrapperTask { private static final Logger log = Logger.getLogger(DVERScrapperTask.class); private String scanDir; private String archiveDir; private String maxFiles; private String xmlDir; public void readAndParseDVER() throws WrongConfigurationException, IOException { if (scanDir == null && archiveDir == null && maxFiles == null && xmlDir == null) { log.info("Wrong configuration used"); throw new WrongConfigurationException("Some configuration is missing in job"); } // Fetching the files up to 50 DirectoryScanner scanner = new HeadDirectoryScanner(Integer.parseInt(maxFiles.trim())); File dverScanDirectory = new File(scanDir); if (dverScanDirectory.isDirectory()) { List<File> files = scanner.listFiles(dverScanDirectory); // Sorting the list as per created time. 
Collections.sort(files, LastModifiedFileComparator.LASTMODIFIED_COMPARATOR); // Iterate the files list for (File file : files) { if (file.isFile()) { PDFTextStripperByArea textStripper = readDVER(file); generateDverXML(FilenameUtils.getBaseName(file.getName()), textStripper); } } } else { throw new WrongConfigurationException("The Path to look for DVER is not a directory"); } } /** * Step - 1 : Read PDF from the path * * @param file * @return * @throws IOException */ @SuppressWarnings("unchecked") private PDFTextStripperByArea readDVER(final File file) throws IOException { PDDocument document = PDDocument.load(file); PDFTextStripperByArea textStripper = new PDFTextStripperByArea(); Rectangle addressRect = new Rectangle(10, 50, 200, 50); textStripper.addRegion("ADDRESS", addressRect); Rectangle reportInfoRect = new Rectangle(300, 50, 300, 50); textStripper.addRegion("REPORT_INFO", reportInfoRect); Rectangle iepRect = new Rectangle(10, 100, 630, 40); textStripper.addRegion("IEP_INFO", iepRect); Rectangle mcRect = new Rectangle(10, 140, 630, 80); textStripper.addRegion("MC_INFO", mcRect); Rectangle driverQueRect = new Rectangle(10, 220, 630, 20); textStripper.addRegion("DRIVER_CHOICE", driverQueRect); Rectangle vehicleIdRect = new Rectangle(10, 240, 630, 30); textStripper.addRegion("VEHICLE_ID", vehicleIdRect); Rectangle brkAdjRect = new Rectangle(10, 270, 630, 20); textStripper.addRegion("BREAK_ADJ", brkAdjRect); Rectangle violationRect = new Rectangle(10, 290, 630, 30); textStripper.addRegion("CHASSIS_VIOLATION", violationRect); Rectangle otherViolationRect = new Rectangle(10, 320, 630, 50); textStripper.addRegion("OTHER_CHASSIS_VIOLATION", otherViolationRect); Rectangle driverNotesRect = new Rectangle(10, 370, 630, 30); textStripper.addRegion("DRIVER_NOTES", driverNotesRect); Rectangle iepNotesRect = new Rectangle(10, 400, 630, 60); textStripper.addRegion("IEP_NOTES", iepNotesRect); Rectangle dverCreationRect = new Rectangle(10, 720, 630, 60); 
textStripper.addRegion("CREATION_NOTES", dverCreationRect); List<PDPage> allPages = document.getDocumentCatalog().getAllPages(); PDPage firstPage = allPages.get(0); textStripper.extractRegions(firstPage); return textStripper; } /** * Step - 2 : Generate XML from Text stripper * * @param tempTextFile * @throws IOException */ private void generateDverXML(String fileName, PDFTextStripperByArea stripper) throws IOException { File outputFile = new File(xmlDir + fileName + ".xml"); OutputStream fos = new FileOutputStream(outputFile); Element dver = new Element("DVER"); Document doc = new Document(dver); // Generate Address Node String addressDetail = stripper.getTextForRegion("ADDRESS"); String[] addressArr = addressDetail.split("\\n"); Element addressNode = new Element("ADDRESS"); addressNode.addContent(new Element("ADDRESS_1").setText(addressArr[0] + "\\n" + addressArr[1])); addressNode.addContent(new Element("ADDRESS_2").setText(addressArr[2] + "\\n" + addressArr[3])); if (addressArr.length > 4) { String[] tempContact = addressArr[4].split(" "); addressNode.addContent(new Element("PHONE").setText(tempContact[1])); addressNode.addContent(new Element("FAX").setText(tempContact[3])); } else { addressNode.addContent(new Element("PHONE").setText("")); addressNode.addContent(new Element("FAX").setText("")); } doc.getRootElement().addContent(addressNode); // Report Information Node String reportDetail = stripper.getTextForRegion("REPORT_INFO"); String[] reportDetailArr = reportDetail.split("\\n"); Element reportInfoNode = new Element("REPORT_INFO"); for (int i = 0; i < reportDetailArr.length; i++) { if (i == 0) { String[] reportInfo = reportDetailArr[i].split(":"); reportInfoNode.addContent(new Element("REPORT_NUMBER").setText(reportInfo[1])); } else if (i == 1) { String[] inspDetail = reportDetailArr[i].split(":"); inspDetail[1] = inspDetail[1].replaceAll("Certification Date", ""); reportInfoNode.addContent(new Element("INSPECTION_DATE").setText(inspDetail[1])); 
reportInfoNode.addContent(new Element("CERTIFICATION_DATE").setText(inspDetail[2])); } else if (i == 2) { String timings = reportDetailArr[i]; timings = timings.replaceAll("Time Started:", ""); timings = timings.replaceAll("Time Ended:", ""); String[] timeDetail = timings.split(" "); reportInfoNode.addContent(new Element("START_TIME").setText(timeDetail[0])); reportInfoNode.addContent(new Element("END_TIME").setText(timeDetail[1])); } else if (i == 3) { String[] reportInfo = reportDetailArr[i].split(":"); reportInfoNode.addContent(new Element("INSPECTION_LEVEL").setText(reportInfo[1])); } else if (i == 4) { String[] reportInfo = reportDetailArr[i].split(":"); reportInfoNode.addContent(new Element("INSPECTION_TYPE").setText(reportInfo[1])); } } doc.getRootElement().addContent(reportInfoNode); // INTERMODAL EQUIPMENT PROVIDER INFORMATION String iepDetail = stripper.getTextForRegion("IEP_INFO"); String[] iepDetailArr = iepDetail.split("\\n"); Element iepInfoNode = new Element("IEP_INFO"); for (int j = 0; j < iepDetailArr.length; j++) { if (j == 1) { iepInfoNode.addContent(new Element("IEP_NAME").setText(iepDetailArr[j])); } else if (j == 2) { String[] tempIepInfo = iepDetailArr[j].split(" "); iepInfoNode.addContent(new Element("US_DOT").setText(tempIepInfo[3])); iepInfoNode.addContent(new Element("DATA_SOURCE").setText(tempIepInfo[6])); } } doc.getRootElement().addContent(iepInfoNode); // MOTOR CARRIER INFORMATION String mcDetail = stripper.getTextForRegion("MC_INFO"); String[] mcDetailArr = mcDetail.split("\\n"); Element mcDetailNode = new Element("MC_INFO"); for (int k = 0; k < mcDetailArr.length; k++) { if (k == 1) { String mcCompAndDriver = mcDetailArr[k].replaceAll("Driver:", ""); mcDetailNode.addContent(new Element("MC_NAME").setText(mcCompAndDriver.split(" ")[0])); mcDetailNode.addContent(new Element("DRIVER").setText(mcCompAndDriver.split(" ")[1])); } else if (k == 2) { mcDetailNode.addContent(new Element("MC_ADD_1").setText(mcDetailArr[k])); } else if (k == 
3) { mcDetailNode.addContent(new Element("MC_ADD_2").setText(mcDetailArr[k])); } else if (k == 4) { String tempStr = mcDetailArr[k]; tempStr = tempStr.replaceAll("USDOT #:", ""); tempStr = tempStr.replaceAll("Phone #:", ""); String[] otherDetails = tempStr.trim().split(" "); mcDetailNode .addContent(new Element("US_DOT").setText(otherDetails[0] != null ? otherDetails[0] : "")); mcDetailNode .addContent(new Element("PHONE").setText(otherDetails[2] != null ? otherDetails[2] : "")); } else if (k == 5) { String tempStr = mcDetailArr[k]; tempStr = tempStr.replaceAll("MC/MX #:", ""); tempStr = tempStr.replaceAll("Fax #:", ""); String[] otherDetails = tempStr.trim().split(" "); mcDetailNode .addContent(new Element("MC_MX").setText(otherDetails[0] != null ? otherDetails[0] : "")); mcDetailNode.addContent(new Element("FAX") .setText(otherDetails.length > 1 && otherDetails[1] != null ? otherDetails[2] : "")); } else if (k == 6) { mcDetailArr[k] = mcDetailArr[k].replaceAll("State #:", ""); mcDetailNode.addContent(new Element("STATE").setText(mcDetailArr[k] != null ? mcDetailArr[k] : "")); } else if (k == 7) { mcDetailArr[k] = mcDetailArr[k].replaceAll("Origin:", ""); mcDetailArr[k] = mcDetailArr[k].replaceAll("Destination:", ""); mcDetailNode.addContent( new Element("ORIGIN_DESTINATION").setText(mcDetailArr[k] != null ? mcDetailArr[k] : "")); } } doc.getRootElement().addContent(mcDetailNode); // VEHICLE IDENTIFICATION String vehicleIdentification = stripper.getTextForRegion("VEHICLE_ID"); String[] vehicleIdArr = vehicleIdentification.split("\\n"); Element vehicleIdNode = new Element("VEHICLE_IDENTIFICATION"); for (int l = 0; l < vehicleIdArr.length; l++) { if (l == 2) { String[] vehicleDetails = vehicleIdArr[l].trim().split(" "); for (int m = 0; m < vehicleDetails.length; m++) { if (m == 0) { vehicleIdNode.addContent( new Element("UNIT").setText(vehicleDetails[m] != null ? 
vehicleDetails[m] : "")); } else if (m == 1) { vehicleIdNode.addContent( new Element("TYPE").setText(vehicleDetails[m] != null ? vehicleDetails[m] : "")); } else if (m == 2) { vehicleIdNode.addContent( new Element("MAKE").setText(vehicleDetails[m] != null ? vehicleDetails[m] : "")); } else if (m == 3) { vehicleIdNode.addContent( new Element("YEAR").setText(vehicleDetails[m] != null ? vehicleDetails[m] : "")); } else if (m == 4) { vehicleIdNode.addContent( new Element("STATE").setText(vehicleDetails[m] != null ? vehicleDetails[m] : "")); } else if (m == 5) { vehicleIdNode.addContent( new Element("LICENSE").setText(vehicleDetails[m] != null ? vehicleDetails[m] : "")); } else if (m == 6) { vehicleIdNode.addContent(new Element("EQUIPMENT_ID") .setText(vehicleDetails[m] != null ? vehicleDetails[m] : "")); } else if (m == 7) { vehicleIdNode.addContent(new Element("UNIT_VIN") .setText(vehicleDetails[m] != null ? vehicleDetails[m] : "")); } else if (m == 8) { vehicleIdNode.addContent( new Element("GVWR").setText(vehicleDetails[m] != null ? vehicleDetails[m] : "")); } else if (m == 9) { vehicleIdNode.addContent(new Element("ISSUED_DECAL") .setText(vehicleDetails[m] != null ? vehicleDetails[m] : "")); } else if (m == 10) { vehicleIdNode.addContent(new Element("OOS_STKR") .setText(vehicleDetails[m] != null ? vehicleDetails[m] : "")); } } } } doc.getRootElement().addContent(vehicleIdNode); // Brake Adjustments String breakAdjustment = stripper.getTextForRegion("BREAK_ADJ"); String[] breakAdjustmentArr = breakAdjustment.split("-"); Element breakAdjustmentNode = new Element("BREAK_ADJUSTMENT"); for (int n = 0; n < breakAdjustmentArr.length; n++) { if (n == 1) { breakAdjustmentNode.setText(breakAdjustmentArr[n] != null ? 
breakAdjustmentArr[n] : ""); } } doc.getRootElement().addContent(breakAdjustmentNode); // Other Chassis Violation details String otherViolationDetail = stripper.getTextForRegion("OTHER_CHASSIS_VIOLATION"); String[] otherViolationDetailArr = otherViolationDetail.split("\\n"); Element otherViolationElement = new Element("OTHER_CHASSIS_VIOLATION"); for (int ocnt = 0; ocnt < (otherViolationDetailArr.length - 1); ocnt++) { if (ocnt > 1) { String[] tempOtrDetail = otherViolationDetailArr[ocnt].split(" "); Element violations = new Element("OTHER_VIOLATIONS"); for (int temp = 0; temp < tempOtrDetail.length; temp++) { if (temp == 0) { violations.addContent(new Element("VIO_CODE").setText(tempOtrDetail[temp])); } else if (temp == 1) { violations.addContent(new Element("SECTION").setText(tempOtrDetail[temp])); } else if (temp == 2) { violations.addContent(new Element("UNIT").setText(tempOtrDetail[temp])); } else if (temp == 3) { violations.addContent(new Element("OOS").setText(tempOtrDetail[temp])); } else if (temp == 4) { violations.addContent(new Element("NUMBER").setText(tempOtrDetail[temp])); } else if (temp == 5) { violations.addContent(new Element("VERIFY").setText(tempOtrDetail[temp])); } else if (temp == 6) { violations.addContent(new Element("CRASH").setText(tempOtrDetail[temp])); } else if (temp == 7) { violations.addContent(new Element("VIO_DESC").setText(tempOtrDetail[temp])); } } otherViolationElement.addContent(violations); } } doc.getRootElement().addContent(otherViolationElement); String driverNotes = stripper.getTextForRegion("DRIVER_NOTES"); Element driverNotesNode = new Element("NOTES_TO_DRIVER"); driverNotesNode.setText(driverNotes); doc.getRootElement().addContent(driverNotesNode); String iepNotes = stripper.getTextForRegion("IEP_NOTES"); Element iepNotesNode = new Element("NOTES_TO_IEP"); iepNotesNode.setText(iepNotes); doc.getRootElement().addContent(iepNotesNode); String creationNotes = stripper.getTextForRegion("CREATION_NOTES"); Element 
creationNotesNode = new Element("CREATED_BY"); creationNotesNode.setText(creationNotes.split("\\n")[1]); doc.getRootElement().addContent(creationNotesNode); XMLOutputter xmlOutput = new XMLOutputter(); // display nice nice xmlOutput.setFormat(Format.getPrettyFormat()); xmlOutput.output(doc, fos); fos.flush(); fos.close(); } public String getScanDir() { return scanDir; } public void setScanDir(String scanDir) { this.scanDir = scanDir; } public String getArchiveDir() { return archiveDir; } public void setArchiveDir(String archiveDir) { this.archiveDir = archiveDir; } public String getMaxFiles() { return maxFiles; } public void setMaxFiles(String maxFiles) { this.maxFiles = maxFiles; } public String getXmlDir() { return xmlDir; } public void setXmlDir(String xmlDir) { this.xmlDir = xmlDir; } }