Java tutorial
/* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ package org.shareok.data.lawlibrary; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.shareok.data.documentProcessor.CsvHandler; import org.shareok.data.documentProcessor.DocumentProcessorUtil; import org.shareok.data.lawlibrary.exceptions.DateReformatException; import org.springframework.beans.factory.annotation.Autowired; import safbuilder.SAFPackage; /** * * @author Tao Zhao */ public class LawLibDataHandlerImpl implements LawLibDataHandler { // private static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(LawLibDataHandlerImpl.class); private static final String SERIAL_A_REGX = "^(.*)(-Serial-)(\\d+).pdf$"; public static final String[] COULUMNS_TO_BE_DELETED = { "Maps", "Fold-Out Charts", "collection name (repeatable)", "Special Collection (repeatable)", "rights (repeatable)", "media_type (repeatable)", "file_format (repeatable)" }; public static final Map<String, String> COLUMN_NAME_MAP_METADATA_SCHEMA; static { COLUMN_NAME_MAP_METADATA_SCHEMA = new HashMap<>(); COLUMN_NAME_MAP_METADATA_SCHEMA.put("Document Title", "dc.title"); COLUMN_NAME_MAP_METADATA_SCHEMA.put("Congress-Session", "dc.description.congressSession"); COLUMN_NAME_MAP_METADATA_SCHEMA.put("Number of pages", "dcterms.extent"); COLUMN_NAME_MAP_METADATA_SCHEMA.put("Official treaty name (title-alternative)", "dcterms.alternative.treaty"); COLUMN_NAME_MAP_METADATA_SCHEMA.put("Serial Set Id", "dc.identifier"); COLUMN_NAME_MAP_METADATA_SCHEMA.put("Document Date", "dcterms.issued"); COLUMN_NAME_MAP_METADATA_SCHEMA.put("Committee", "dc.description.committee"); COLUMN_NAME_MAP_METADATA_SCHEMA.put("Descriptive Title", "dcterms.alternative.title"); COLUMN_NAME_MAP_METADATA_SCHEMA.put("Cross Reference - Other House or Senate", "dc.description.otherSerialSetID"); COLUMN_NAME_MAP_METADATA_SCHEMA.put("Johnson's Bib reference", "dc.description.johnsonReference"); COLUMN_NAME_MAP_METADATA_SCHEMA.put("Johnson Annotation", "dc.description.johnsonAnnotation"); COLUMN_NAME_MAP_METADATA_SCHEMA.put("Notes (hidden)", "dc.description.notes"); COLUMN_NAME_MAP_METADATA_SCHEMA.put("file_location", "filename"); } //csv.setFileName("/Users/zhao0677/Projects/law-library/pdf/load-test.csv"); private CsvHandler csv; private String inputFilePath; private String outputFilePath; private String outputCsvFilePath; private List<String> pdfFileList = new ArrayList<>(); private List<String> matchedPdfFileList = new ArrayList<>(); private Map data; public CsvHandler getCsv() { return csv; } public String getInputFilePath() { return inputFilePath; } public String getOutputFilePath() { return outputFilePath; } public List<String> getPdfFileList() { return pdfFileList; } public List<String> getMatchedPdfFileList() { return matchedPdfFileList; } @Override public Map getData() { return data; } public String getOutputCsvFilePath() { return outputCsvFilePath; } @Autowired public void setCsv(CsvHandler csv) { this.csv = csv; } public void setInputFilePath(String inputFilePath) { this.inputFilePath = inputFilePath; } public void setOutputFilePath(String outputFilePath) { this.outputFilePath = outputFilePath; } public void setPdfFileList(List<String> pdfFileList) { this.pdfFileList = pdfFileList; } public void setData(Map data) { this.data = data; } public void setOutputCsvFilePath(String outputCsvFilePath) { this.outputCsvFilePath = outputCsvFilePath; } public void setMatchedPdfFileList(List<String> matchedPdfFileList) { this.matchedPdfFileList = matchedPdfFileList; } @Override public void readSourceData() { if (null == data) { if (null == csv.getFileName() || "".equals(csv.getFileName())) { csv.setFileName(inputFilePath); } csv.readData(); data = csv.getData(); } cleanData(); } @Override public void outputMetaData() { throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates. } /** * The data provided by the Law Library have some problems: * 1. The PDF file names do NOT always match file names listed in the CSV file * 2. The file extension names are inconsistent in terms of capitalization * 3. Format the existing dates to be ISO-8601 format * 4. Remove some columns in the data file as they either have no data or are not recognized by the metadata schema * 5. Translate the columns into correct metadata schema * 6. When the item has no title, use the dcterms.alternative as dc.title * 7. Change the column name File_location to be filename for the SAFBuilder * * The solutions: * 1. Match the file names in the CSV file with the existing files in the folder * 2. All the file names have extension like .pdf * 4. Remove the unnecessary columns * 5. Convert the normal columns into correct metadata schema * 6. Cope the data to the column of dc.title and clear this column * 7. Just change it * */ private void cleanData() { if (null == data) { csv.readData(); data = csv.getData(); } try { if (null == pdfFileList || pdfFileList.size() == 0) { getPdfFileListFromCleanedOutputPathFiles(); } // Remove the unused columns csv.deleteColumnByColumnName(COULUMNS_TO_BE_DELETED); // Match the pdf files with the file names in the csv file // Only keep the data records that have the PDF files // Also update the column names to the metadata schema // At the same time, update the date format of the records Map<String, String> cleanData = new HashMap<>(); String newKey = ""; String value = ""; int newRecordCount = 0; for (int i = 1; i < csv.getRecordCount(); i++) { String key = "file_location-" + String.valueOf(i); String csvFileName = DocumentProcessorUtil.getFileNameWithoutExtension((String) data.get(key)); if (csvFileName.toLowerCase().endsWith(".pdf")) { csvFileName = csvFileName.replace(".pdf", ""); csvFileName = csvFileName.replace(".PDF", ""); } // 837 9004 kim csvFileName += ".pdf"; // Process the files that have the "-Serial-" pattern when the actual PDF files end with ***A.PDF String csvFileNameWithA = matchSerialAFiles(csvFileName); data.put("file_location-" + String.valueOf(i), csvFileName); if (pdfFileList.contains(csvFileName) || pdfFileList.contains(csvFileNameWithA)) { String fileInfo = ""; if (pdfFileList.contains(csvFileName)) { matchedPdfFileList.add(csvFileName); fileInfo += csvFileName; newRecordCount++; } if (pdfFileList.contains(csvFileNameWithA)) { matchedPdfFileList.add(csvFileNameWithA); if ("".equals(fileInfo)) { fileInfo += csvFileNameWithA; } else { fileInfo += "||" + csvFileNameWithA; } newRecordCount++; } for (String column : csv.getFileHeadMapping()) { if (null == column || column.equals("")) { continue; } else { column = column.trim(); key = column + "-" + String.valueOf(i); value = (String) data.get(key); // if(null == value || "null" == value){ // System.out.println("null value for paper "+csvFileName+" with column "+column); // } String dcTerm = (String) (COLUMN_NAME_MAP_METADATA_SCHEMA.get(column)); newKey = dcTerm + "-" + String.valueOf(i); if (column.equals("Document Date")) { value = changeDataFormat(value); } else if (column.contains("Document Title") && value.contains("Document not titled")) { value = (String) data .get("Official treaty name (title-alternative)-" + String.valueOf(i)); data.put("Official treaty name (title-alternative)-" + String.valueOf(i), ""); } else if (column.contains("file_location")) { value = fileInfo; } cleanData.put(newKey, value); } } } } setData(cleanData); csv.setData((HashMap) cleanData); String[] newHeadingsArray = COLUMN_NAME_MAP_METADATA_SCHEMA.values() .toArray(new String[COLUMN_NAME_MAP_METADATA_SCHEMA.values().size()]); csv.setFileHeadMapping(newHeadingsArray); csv.setRecordCount(newRecordCount); outputCsvFilePath = csv.outputData(outputFilePath + File.separator + "metadata.csv"); DocumentProcessorUtil.outputStringToFile(String.join("\n", matchedPdfFileList), new File(outputFilePath).getPath() + File.separator + "matchedPdfFiles.txt"); DocumentProcessorUtil.outputStringToFile(String.join("\n", getUnmatchedFileList()), new File(outputFilePath).getPath() + File.separator + "unmatchedPdfFiles.txt"); } catch (Exception ex) { logger.error("Cannot clean up the data.", ex); } } // The existing date format is mm-dd-yyyy, now changes it to be yyyy-mm-dd private String changeDataFormat(String date) { try { String[] dateInfo = date.split("-"); if (null == dateInfo || dateInfo.length != 3) { throw new DateReformatException("The date is not in the form of mm-dd-yyyy"); } return dateInfo[2] + "-" + dateInfo[0] + "-" + dateInfo[1]; } catch (DateReformatException ex) { logger.error("Cannot reformat the date", ex); } return null; } private List<String> getUnmatchedFileList() { List<String> unmatchedFileList = new ArrayList<String>(); for (String fileName : pdfFileList) { unmatchedFileList.add(fileName); } unmatchedFileList.removeAll(matchedPdfFileList); return unmatchedFileList; } /** * Many files have name containing XXX-XXX-XXX-Serial-8888A.pdf, e.g. "Senate-46-2-Miscellaneous-20-Serial-1890A.pdf" * In the Excel file where the corresponding file name does not have the A after the number following the "-Serial-" pattern * The regular expression pattern is used to identify these files * * @param file : the name of the actual PDF file to be matched from the file list of pdf files names */ public String matchSerialAFiles(String fileName) { Pattern p = Pattern.compile(SERIAL_A_REGX); Matcher m = p.matcher(fileName); if (m.find()) { fileName = fileName.substring(0, fileName.length() - 4) + "A.pdf"; } return fileName; } public void getPdfFileListFromTextFile(String filePath) { setPdfFileList(DocumentProcessorUtil.readTextFileIntoList(filePath)); } public void getMatchedFileListFromTextFile(String filePath) { setMatchedPdfFileList(DocumentProcessorUtil.readTextFileIntoList(filePath)); } /** * Many pdf files end with .pdf.pdf; in addition, the uppercase and lowercase are messy */ public void getPdfFileListFromCleanedOutputPathFiles() { File pdfFolder = new File(outputFilePath); File[] pdfFlist = pdfFolder.listFiles(); for (File file : pdfFlist) { String name = file.getName(); String parent = file.getParent(); if (file.getPath().toLowerCase().endsWith(".pdf")) { String nameWithoutExtension = DocumentProcessorUtil.getFileNameWithoutExtension(name); if (nameWithoutExtension.toLowerCase().endsWith(".pdf")) { nameWithoutExtension = nameWithoutExtension.replaceAll(".pdf", ""); nameWithoutExtension = nameWithoutExtension.replaceAll(".PDF", ""); } pdfFileList.add(nameWithoutExtension + ".pdf"); file.renameTo(new File(parent + File.separator + nameWithoutExtension + ".pdf")); //file.renameTo(new File(".PDF")); } } } public void outputSafPackage() { try { String csvPath = outputFilePath + File.separator + "metadata.csv"; SAFPackage safPackageInstance = new SAFPackage(); safPackageInstance.generateManifest(csvPath); safPackageInstance.processMetaPack(csvPath, true); } catch (IOException ex) { logger.error("Cannot generate the SAF package", ex); } } }