Java tutorial
/* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ package eu.transkribus.languageresources.extractor.pagexml; import java.io.File; import java.util.ArrayList; import java.util.Collections; import java.util.LinkedList; import java.util.List; import java.util.regex.Pattern; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import org.w3c.dom.Document; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import eu.transkribus.interfaces.IDictionary; import eu.transkribus.languageresources.extractor.pagexml.annotations.PAGEXMLAbbreviation; import eu.transkribus.languageresources.extractor.pagexml.annotations.PAGEXMLAnnotation; import eu.transkribus.languageresources.extractor.pagexml.annotations.PAGEXMLIndex; import eu.transkribus.languageresources.extractor.pagexml.tagextractor.AbbreviationsBuilder; import eu.transkribus.languageresources.extractor.pagexml.tagextractor.PAGEXMLValueBuilder; import eu.transkribus.languageresources.extractor.pagexml.tagextractor.SimpleBuilder; import eu.transkribus.languageresources.extractor.pagexml.tagextractor.TokenBuilder; import eu.transkribus.languageresources.extractor.xml.XMLExtractor; import eu.transkribus.interfaces.languageresources.IPagewiseTextExtractor; import eu.transkribus.languageresources.util.PAGEFileComparator; import java.util.Arrays; import java.util.Collection; import org.apache.commons.math3.util.Pair; /** * * @author max */ public class PAGEXMLExtractor extends XMLExtractor implements IPagewiseTextExtractor { // Patterns do take time to compile, thus make them // static for performance reasons. private static Pattern patternAbbrev = Pattern.compile("abbrev\\s*\\{([\\w-,:;\\s]*)\\}"); private static Pattern patternAbbrevExpansion = Pattern.compile(".*expansion:([\\w-]*);.*"); private static Pattern patternPlace = Pattern.compile("place\\s*\\{([\\w-,:;\\s]*)\\}"); private static Pattern patternOffset = Pattern.compile(".*offset:([0-9]*);.*"); private static Pattern patternLength = Pattern.compile(".*length:([0-9]*);.*"); public PAGEXMLExtractor() { super(); } public PAGEXMLExtractor(String pathToConfig) { super(new File(pathToConfig)); } public PAGEXMLExtractor(File configFile) { super(configFile); } @Override public String extractTextFromDocument(String path) { return extractTextFromDocument(path, "\n"); } @Override public String extractTextFromDocument(String path, String splitCharacter) { List<File> files = getFileList(path); return extractTextFromFileList(files, splitCharacter); } public String extractTextFromFileList(List<File> files, String splitCharacter) { StringBuilder content = new StringBuilder(); for (int fileIndex = 0; fileIndex < files.size(); fileIndex++) { content.append(getTextFromFile(files.get(fileIndex))); if (fileIndex + 1 < files.size()) { content.append(splitCharacter); } } return content.toString(); } @Override public List<String> extractTextFromDocumentPagewise(String path) { List<String> content = new ArrayList<>(); List<File> files = getFileList(path); for (File f : files) { content.add(getTextFromFile(f)); } return content; } @Override public String extractTextFromPage(String path, int page) { List<File> files = getFileList(path); return getTextFromFile(files.get(page)); } private List<File> getFileList(String path) { if (!path.endsWith("/page")) { path += "/page"; } File folder = new File(path); List<String> fileNames = new ArrayList<>(); for (File f : folder.listFiles((File pathname) -> pathname.getName().endsWith(".xml"))) { fileNames.add(f.getName()); } Collections.sort(fileNames, new PAGEFileComparator()); List<File> files = new ArrayList<>(); for (String fileName : fileNames) { files.add(new File(path + "/" + fileName)); } // files = files.subList(0, 3); return files; } private List<Node> getNodesFromXml(Document doc) { return getNodesFromXml(doc, "Unicode"); } private List<Node> getNodesFromXml(Document doc, String tagName) { try { NodeList nodeList = doc.getElementsByTagName(tagName); List<Node> unicodeNodes = new LinkedList<>(); for (int i = 0; i < nodeList.getLength(); i++) { if (nodeList.item(i).getParentNode().getParentNode().getNodeName().equals("TextLine")) { unicodeNodes.add(nodeList.item(i)); } } return unicodeNodes; } catch (Exception ex) { System.out.println(ex); throw new RuntimeException("Could not load nodes from file!"); } } private List<Node> getNodesFromFile(File f) { try { DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); Document doc = dBuilder.parse(f); return getNodesFromXml(doc); } catch (Exception ex) { System.out.println(ex); throw new RuntimeException("Could not load nodes from file!"); } } protected String getTextFromXml(Document d) { List<Node> unicodeNodes = getNodesFromXml(d); return getTextFromNodeList(unicodeNodes); } public String getTextFromFile(File f) { List<Node> unicodeNodes = getNodesFromFile(f); return getTextFromNodeList(unicodeNodes); } private String getTextFromNodeList(List<Node> unicodeNodes) { StringBuilder content = new StringBuilder(); Node unicodeNode; int numNodes = unicodeNodes.size(); for (int nodeIndex = 0; nodeIndex < numNodes; nodeIndex++) { unicodeNode = unicodeNodes.get(nodeIndex); content.append(getTextFromNode(unicodeNode)); if (nodeIndex + 1 < numNodes) { content.append("\n"); } } return content.toString(); } public String getTextFromNode(Node unicodeNode) { String textContent = unicodeNode.getTextContent(); Node textLineNode = unicodeNode.getParentNode().getParentNode(); Node customNode = textLineNode.getAttributes().getNamedItem("custom"); String customTagValue = null; if (customNode != null) { customTagValue = textLineNode.getAttributes().getNamedItem("custom").getTextContent(); } return getTextFromNode(textContent, customTagValue); } public String getTextFromNode(String textContent, String customTagValue) { String abbreviationMode = properties.getProperty("abbreviation_expansion_mode", "keep"); return getTextFromNode(textContent, customTagValue, abbreviationMode); } public String getTextFromNode(String textContent, String customTagValue, String abbreviationExpansionMode) { if (!abbreviationExpansionMode.equals("keep") && !abbreviationExpansionMode.equals("expand")) { throw new IllegalArgumentException( "Unkown mode, abbreviationExpansionMode has to be 'keep' or 'expand'"); } if (abbreviationExpansionMode.equals("expand")) { textContent = expandAbbreviations(textContent, customTagValue); } return textContent; } public String expandAbbreviations(String textContent, String customTagValue) { return expandAbbreviations(textContent, customTagValue, 0, 0); } public String expandAbbreviations(String textContent, String customTagValue, int pageIndex, int lineIndex) { List<PAGEXMLAbbreviation> abbreviations = this.<PAGEXMLAbbreviation>extractValueFromLine(new LinkedList<>(), textContent, customTagValue, new AbbreviationsBuilder(), pageIndex, lineIndex); String partLeft; String partRight; int startLeft; int expansionDiffSum = 0; for (PAGEXMLAbbreviation abbreviation : abbreviations) { if (abbreviation.getExpansion() != null) { startLeft = abbreviation.getStartOffset() + expansionDiffSum; partLeft = textContent.substring(0, startLeft); partRight = textContent.substring(startLeft + abbreviation.getLength(), textContent.length()); textContent = partLeft + abbreviation.getExpansion() + partRight; expansionDiffSum += abbreviation.getLengthDiff(); } } return textContent; } protected <V extends PAGEXMLAnnotation> List<V> extractValueFromLine(List<V> list, String line, String customTagValue, PAGEXMLValueBuilder builder, int pageIndex, int lineIndex) { return builder.extract(list, line, customTagValue, pageIndex, lineIndex); } @Override public IDictionary extractAbbreviations(String path) { return extractValuesAsDict(path, new AbbreviationsBuilder()); } @Override public IDictionary extractPlaceNames(String path) { return extractValuesAsDict(path, new SimpleBuilder("place")); } @Override public IDictionary extractPersonNames(String path) { return extractValuesAsDict(path, new SimpleBuilder("person")); } public IDictionary extractOrganizations(String path) { return extractValuesAsDict(path, new SimpleBuilder("organization")); } public List<PAGEXMLAbbreviation> extractAbbreviationsAsList(String path) { return extractValues(path, new AbbreviationsBuilder()); } public List<PAGEXMLAnnotation> extractPlaceNamesAsList(String path) { return extractValues(path, new SimpleBuilder("place")); } public List<PAGEXMLAnnotation> extractPersonNamesAsList(String path) { return extractValues(path, new SimpleBuilder("person")); } public List<PAGEXMLAnnotation> extractOrganizationsAsList(String path) { return extractValues(path, new SimpleBuilder("organization")); } private <A extends PAGEXMLAnnotation> IDictionary extractValuesAsDict(String path, PAGEXMLValueBuilder builder) { return builder.toDictionary(extractValues(path, builder)); } private <A extends PAGEXMLAnnotation> List<A> extractValues(String path, PAGEXMLValueBuilder builder) { List<A> values = new LinkedList<>(); List<File> files = getFileList(path); for (int fileIndex = 0; fileIndex < files.size(); fileIndex++) { values = extractValuesFromPage(values, files.get(fileIndex), builder, fileIndex); } return values; } @Override public IDictionary extractAbbreviationsFromPage(String path, int page) { AbbreviationsBuilder builder = new AbbreviationsBuilder(); List<PAGEXMLAbbreviation> abbrevations = this.<PAGEXMLAbbreviation>extractValuesFromPage(new LinkedList<>(), path, page, builder); return builder.toDictionary(abbrevations); } private <A extends PAGEXMLAnnotation> List<A> extractValuesFromPage(List<A> list, String path, int page, PAGEXMLValueBuilder builder) { File file = getFileList(path).get(page); return extractValuesFromPage(list, file, builder, page); } private <A extends PAGEXMLAnnotation> List<A> extractValuesFromPage(List<A> list, File file, PAGEXMLValueBuilder builder, int pageIndex) { List<Node> unicodeNodes = getNodesFromFile(file); for (int i = 0; i < unicodeNodes.size(); i++) { if (unicodeNodes.get(i).getParentNode().getParentNode().getNodeName().equals("TextLine")) { list = extractValuesFromLine(list, unicodeNodes.get(i), builder, pageIndex, i); } } return list; } private String getCustomTagValue(Node unicodeNode) { Node textLineNode = unicodeNode.getParentNode().getParentNode(); return textLineNode.getAttributes().getNamedItem("custom").getTextContent(); } private <A extends PAGEXMLAnnotation> List<A> extractValuesFromLine(List<A> list, Node unicodeNode, PAGEXMLValueBuilder builder, int pageIndex, int lineIndex) { String textContent = unicodeNode.getTextContent(); String customTagValue = getCustomTagValue(unicodeNode); return extractValueFromLine(list, textContent, customTagValue, builder, pageIndex, lineIndex); } public PAGEXMLIndex createIndex(String path) { PAGEXMLIndex index = new PAGEXMLIndex(); index.addTokens(extractValues(path, new TokenBuilder())); return index; } public List<Pair<String, String>> extractTextFromDocumentPairwise(String path1, String path2) { List<File> fileList1 = getFileList(path1); List<File> fileList2 = getFileList(path2); if (fileList1.size() != fileList2.size()) { return createWholeDocumentPair(fileList1, fileList2); } return createPageWisePairs(fileList1, fileList2); } public List<Pair<String, String>> extractTextFromFilePairwise(String path1, String path2) { List<File> fileList1 = Arrays.asList(new File[] { new File(path1) }); List<File> fileList2 = Arrays.asList(new File[] { new File(path2) }); if (fileList1.size() != fileList2.size()) { return createWholeDocumentPair(fileList1, fileList2); } return createPageWisePairs(fileList1, fileList2); } private List<Pair<String, String>> createWholeDocumentPair(List<File> fileList1, List<File> fileList2) { String text1 = extractTextFromFileList(fileList1, "\n"); String text2 = extractTextFromFileList(fileList2, "\n"); return Arrays.asList(new Pair(text1, text2)); } private List<Pair<String, String>> createPageWisePairs(List<File> fileList1, List<File> fileList2) { List<Pair<String, String>> pagePairs = new LinkedList<>(); for (int fileIndex = 0; fileIndex < fileList1.size(); fileIndex++) { List<Node> nodeList1 = getNodesFromFile(fileList1.get(fileIndex)); List<Node> nodeList2 = getNodesFromFile(fileList2.get(fileIndex)); pagePairs.addAll(getPagePairs(nodeList1, nodeList2)); } return pagePairs; } private List<Pair<String, String>> getPagePairs(List<Node> nodeList1, List<Node> nodeList2) { if (allAreAlligned(nodeList1, nodeList2)) { List<Pair<String, String>> linePairs = new LinkedList<>(); for (int nodeIndex = 0; nodeIndex < nodeList1.size(); nodeIndex++) { String text1 = getTextFromNode(nodeList1.get(nodeIndex)); String text2 = getTextFromNode(nodeList2.get(nodeIndex)); linePairs.add(new Pair(text1, text2)); } return linePairs; } else { String text1 = getTextFromNodeList(nodeList1); String text2 = getTextFromNodeList(nodeList2); return Arrays.asList(new Pair(text1, text2)); } } private boolean allAreAlligned(List<Node> nodeList1, List<Node> nodeList2) { if (nodeList1.size() != nodeList2.size()) return false; for (int nodeIndex = 0; nodeIndex < nodeList1.size(); nodeIndex++) { if (!equalsBaseline(nodeList1.get(nodeIndex), nodeList2.get(nodeIndex))) return false; } return true; } private static Node getChild(Node parent, String name) { NodeList childNodes = parent.getChildNodes(); Node res = null; for (int i = 0; i < childNodes.getLength(); i++) { Node child = childNodes.item(i); if (child.getNodeName().equals(name)) { if (res != null) { throw new RuntimeException("there are more than one child with this name"); } res = child; } } return res; } private boolean equalsBaseline(Node node1, Node node2) { if (node1 == node2) { return true; } String points1 = getChild(node1.getParentNode().getParentNode(), "Baseline").getAttributes() .getNamedItem("points").getTextContent(); String points2 = getChild(node2.getParentNode().getParentNode(), "Baseline").getAttributes() .getNamedItem("points").getTextContent(); return points1.equals(points2); } }