Java tutorial
/* * MAE - Multi-purpose Annotation Environment * * Copyright Keigh Rim (krim@brandeis.edu) * Department of Computer Science, Brandeis University * Original program by Amber Stubbs (astubbs@cs.brandeis.edu) * * MAE is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, @see <a href="http://www.gnu.org/licenses">http://www.gnu.org/licenses</a>. * * For feedback, reporting bugs, use the project on Github * @see <a href="https://github.com/keighrim/mae-annotation">https://github.com/keighrim/mae-annotation</a>. */ package edu.brandeis.cs.nlp.mae.io; import edu.brandeis.cs.nlp.mae.database.MaeDBException; import edu.brandeis.cs.nlp.mae.database.MaeDriverI; import edu.brandeis.cs.nlp.mae.model.ArgumentType; import edu.brandeis.cs.nlp.mae.model.AttributeType; import edu.brandeis.cs.nlp.mae.model.TagType; import org.apache.commons.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.InputStream; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.Scanner; import java.util.regex.Matcher; import java.util.regex.Pattern; /* * DTDLoader is ... */ public class DTDLoader { private static final Logger logger = LoggerFactory.getLogger(DTDLoader.class.getName()); private MaeDriverI driver; private ArrayList<TagType> loadedTagTypes; private HashMap<String, String> prefixes; public DTDLoader(MaeDriverI driver) throws MaeIODTDException { this.driver = driver; this.prefixes = new HashMap<>(); this.loadedTagTypes = new ArrayList<>(); } public boolean read(File file) throws MaeIODTDException, MaeDBException { try { logger.info("reading annotation scheme from: " + file.getAbsolutePath()); driver.setTaskFileName(file.getAbsolutePath()); return this.read(new FileInputStream(file)); } catch (FileNotFoundException e) { String message = "file not found: " + file.getAbsolutePath(); logger.error(message); throw new MaeIODTDException(message, e); } } public boolean read(String string) throws MaeIODTDException, MaeDBException { logger.debug("reading annotation scheme from plain JAVA string"); return this.read(IOUtils.toInputStream(string)); } public boolean read(InputStream stream) throws MaeIODTDException, MaeDBException { Scanner sc = new Scanner(stream, "UTF-8"); int lineNum = 1; while (sc.hasNextLine()) { String next = sc.nextLine(); // getting rid of comments if (next.contains("<!--")) { while (sc.hasNextLine() && !next.contains("-->")) { next = sc.nextLine(); lineNum++; } next = sc.nextLine(); } //then, concatenate lines about a tag into one string String element = ""; if (next.contains("<")) { element += next; while (sc.hasNextLine() && !next.contains(">")) { next = sc.nextLine(); lineNum++; element += next; } } lineNum++; // remove some problematic unicode characters before processing element = normalizeLine(element); process(element, lineNum); } validateLinkTagTypes(); return validateReadTask(); } private boolean validateReadTask() throws MaeDBException { return driver.getAllTagTypes().size() > 1; } private void validateLinkTagTypes() throws MaeDBException { for (TagType linktag : driver.getLinkTagTypes()) { if (linktag.getArgumentTypes().size() == 0) { addDefaultArguments(linktag); } } } private void addDefaultArguments(TagType linktag) throws MaeDBException { // default arguments are NOT req, but note that args are always IDref driver.createArgumentType(linktag, "from"); driver.createArgumentType(linktag, "to"); } public String normalizeLine(String line) { return line.trim().replaceAll("\\s+", " ").replaceAll("[\u201C\u201D]", "\"").replaceAll("[\u2018\u2019]", "'"); } private void process(String element, int lineNum) throws MaeIODTDException, MaeDBException { if (element.startsWith("<!ELEMENT")) { processTagType(element, lineNum); } if (element.startsWith("<!ATTLIST")) { processAttribute(element, lineNum); } if (element.startsWith("<!ENTITY")) { processMeta(element, lineNum); } } private void processTagType(String element, int lineNum) throws MaeIODTDException, MaeDBException { Pattern tTypePattern = Pattern.compile("<! *ELEMENT +(\\S+) +(\\bEMPTY\\b|\\( *(#\\bPCDATA\\b)\\s*\\)) *>"); Matcher tTypeMatcher = tTypePattern.matcher(element); if (tTypeMatcher.find()) { String name = tTypeMatcher.group(1); boolean isLink = tTypeMatcher.group(3) == null || !tTypeMatcher.group(3).equals("#PCDATA"); String prefix = generatePrefix(name); logger.debug(String.format("adding a tag type: %s (%s)", name, prefix)); loadedTagTypes.add(driver.createTagType(name, prefix, isLink)); } else { this.error(String.format("DTD seems to be ill-formed: %s at %d", element, lineNum)); } } private String generatePrefix(String fullname) throws MaeIODTDException { int prefixLen = 1; String prefix = fullname.substring(0, prefixLen); while (prefixes.values().contains(prefix)) { if (prefix.length() >= fullname.length()) { String message = "duplicate TagType name found: " + fullname; logger.error(message); throw new MaeIODTDException(message); } prefixLen++; prefix = fullname.substring(0, prefixLen); } prefixes.put(fullname, prefix); return prefix; } private void processMeta(String element, int lineNum) throws MaeIODTDException, MaeDBException { // currently it can only process "internal parsed entities" element of DTD Pattern elementPattern = Pattern.compile("<!\\s*ENTITY +(.+) +\"(.+)\" *>"); Matcher elementMatcher = elementPattern.matcher(element); boolean add; add = elementMatcher.matches() && addMetadata(elementMatcher.group(1), elementMatcher.group(2)); if (!add) { this.error(String.format("error while adding a metadata: %s at %d", element, lineNum)); } } private boolean addMetadata(String key, String value) throws MaeDBException { boolean success; switch (key) { case "name": driver.setTaskName(value); logger.debug("adding DTD name: " + value); success = true; break; default: logger.debug("unresolved identifier: " + key); success = false; } return success; } private void processAttribute(String element, int lineNum) throws MaeIODTDException, MaeDBException { Pattern attPattern = Pattern.compile( "<! *ATTLIST +(\\S+) +(\\S+) +(\\( *.+ *\\)|\\bCDATA\\b|\\bID\\b|\\bIDREF\\b)? *(prefix=\"(.+)\")? *(#\\bREQUIRED\\b|#\\bIMPLIED\\b)? *(\"(.+)\")?"); Matcher attMatcher = attPattern.matcher(element); if (attMatcher.find()) { String tagTypeName = attMatcher.group(1); String attTypeName = attMatcher.group(2); String valueset = attMatcher.group(3); if (valueset == null) { valueset = "CDATA"; } String prefix = attMatcher.group(5); boolean required = attMatcher.group(6) != null && attMatcher.group(6).equals("#REQUIRED"); String defaultValue = attMatcher.group(8); TagType tagtype = isTagTypeLoaded(tagTypeName); if (tagtype == null) { this.error("tag type is not define for an attribute/argument: " + attTypeName); } else if (attTypeName.matches("arg[0-9]+")) { defineArgument(lineNum, tagtype, attTypeName, valueset, prefix, required, defaultValue); } else { defineAttribute(lineNum, tagtype, attTypeName, valueset, prefix, required, defaultValue); } } else { this.error(String.format("DTD seems to be ill-formed: \"%s\" at %d", element, lineNum)); } } private AttributeType defineAttribute(int lineNum, TagType tagType, String attTypeName, String valueset, String prefix, boolean required, String defaultValue) throws MaeIODTDException, MaeDBException { AttributeType type = null; switch (valueset) { case "ID": if (!attTypeName.equals("id")) { this.error("value type \"ID\" should have name \"id\": " + lineNum); } else if (prefix != null) { if (prefixes.values().contains(prefix)) { this.error(String.format("prefix \"%s\" is already being used", prefix)); } logger.debug(String.format("setting a custom prefix to tag type \"%s\" : %s ", tagType.getName(), attTypeName)); driver.setTagTypePrefix(tagType, prefix); prefixes.put(tagType.getName(), prefix); } break; case "IDREF": type = addAttributeType(tagType, attTypeName); logger.debug("setting as id-referencing attribute: " + attTypeName); driver.setAttributeTypeIDRef(type, true); case "CDATA": if ((attTypeName.equals("spans") || attTypeName.equals("start")) && !required) { logger.debug("setting as non-consuming: " + tagType.getName()); driver.setTagTypeNonConsuming(tagType, true); } else { type = addAttributeType(tagType, attTypeName); } break; default: String[] validValues = valueset.replaceAll("(\\( *| *\\))", "").split(" \\| "); if (validValues.length < 2) { this.error(String.format("the set of values should have two or more values: \"%s\" at %d", valueset, lineNum)); } type = addAttributeType(tagType, attTypeName); logger.debug(String.format("setting valid value set to \"%s\": %s", attTypeName, Arrays.toString(validValues))); driver.setAttributeTypeValueSet(type, Arrays.asList(validValues)); } if (type != null) { if (defaultValue != null) { if (type.getValuesetAsList().size() == 0 || type.getValuesetAsList().contains(defaultValue)) { logger.debug(String.format("setting default value to \"%s\": %s", attTypeName, defaultValue)); driver.setAttributeTypeDefaultValue(type, defaultValue); } else { this.error(String.format("Default value \"%s\" is not in the pre-defined value set %s: at %d", defaultValue, type.getValuesetAsList().toString(), lineNum)); } } if (required) { logger.debug("setting to a required attribute: " + attTypeName); driver.setAttributeTypeRequired(type, true); } } return type; } private AttributeType addAttributeType(TagType tagType, String attTypeName) throws MaeDBException { logger.debug(String.format("adding a new attribute type attached to \"%s\": %s", tagType.getName(), attTypeName)); return driver.createAttributeType(tagType, attTypeName); } private ArgumentType defineArgument(int lineNum, TagType tagType, String argTypeName, String valueset, String prefix, boolean required, String defaultValue) throws MaeIODTDException, MaeDBException { ArgumentType type = null; if (!tagType.isLink()) { this.error(String.format("extent tag \"%s\" can't have an argument \"%s\" at %d", tagType.getName(), argTypeName, lineNum)); } else if (defaultValue != null) { this.error("arguments cannot have a default value: " + lineNum); } else if (prefix != null && !valueset.equals("IDREF")) { this.error("argument definition should be set to \"IDREF\": " + lineNum); } else if (prefix != null) { type = driver.createArgumentType(tagType, prefix); } else { type = driver.createArgumentType(tagType, argTypeName); } if (required && type != null) { logger.debug("setting to a required argument: " + argTypeName); driver.setArgumentTypeRequired(type, true); } return type; } private TagType isTagTypeLoaded(String name) throws MaeDBException { for (TagType tagtype : loadedTagTypes) { if (tagtype.getName().equals(name)) { return driver.getTagTypeByName(name); } } return null; } private void error(String message) throws MaeIODTDException { logger.error(message); throw new MaeIODTDException(message); } }