List of usage examples for javax.xml.stream XMLInputFactory setProperty
public abstract void setProperty(java.lang.String name, Object value) throws java.lang.IllegalArgumentException;
From source file: tpt.dbweb.cat.io.TaggedTextXMLReader.java
private Iterator<TaggedText> getIterator(InputStream is, String errorMessageInfo) { XMLStreamReader tmpxsr = null; try {//from www .ja v a2 s . com XMLInputFactory xif = XMLInputFactory.newInstance(); xif.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false); xif.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, false); xif.setProperty(XMLInputFactory.IS_VALIDATING, false); tmpxsr = xif.createXMLStreamReader(is); } catch (XMLStreamException | FactoryConfigurationError e) { e.printStackTrace(); return null; } final XMLStreamReader xsr = tmpxsr; return new PeekIterator<TaggedText>() { @Override protected TaggedText internalNext() { ArrayList<TextSpan> openMarks = new ArrayList<>(); StringBuilder pureTextSB = new StringBuilder(); ArrayList<TextSpan> marks = new ArrayList<>(); marks.add(new TextSpan(null, 0, 0)); TaggedText tt = null; try { loop: while (xsr.hasNext()) { xsr.next(); int event = xsr.getEventType(); switch (event) { case XMLStreamConstants.START_ELEMENT: if ("articles".equals(xsr.getLocalName())) { } else if ("article".equals(xsr.getLocalName())) { tt = new TaggedText(); for (int i = 0; i < xsr.getAttributeCount(); i++) { if ("id".equals(xsr.getAttributeLocalName(i))) { tt.id = xsr.getAttributeValue(i); } tt.info().put(xsr.getAttributeLocalName(i), xsr.getAttributeValue(i)); } } else if ("mark".equals(xsr.getLocalName())) { TextSpan tr = new TextSpan(null, pureTextSB.length(), pureTextSB.length()); for (int i = 0; i < xsr.getAttributeCount(); i++) { tr.info().put(xsr.getAttributeLocalName(i), xsr.getAttributeValue(i)); } openMarks.add(tr); } else if ("br".equals(xsr.getLocalName())) { // TODO: how to propagate tags from the input to the output? 
} else { log.warn("ignore tag " + xsr.getLocalName()); } break; case XMLStreamConstants.END_ELEMENT: if ("mark".equals(xsr.getLocalName())) { // search corresponding <mark ...> TextSpan tr = openMarks.remove(openMarks.size() - 1); if (tr == null) { log.warn("markend at " + xsr.getLocation().getCharacterOffset() + " has no corresponding mark tag"); break; } tr.end = pureTextSB.length(); marks.add(tr); } else if ("article".equals(xsr.getLocalName())) { tt.text = StringUtils.stripEnd(pureTextSB.toString().trim(), " \t\n"); pureTextSB = new StringBuilder(); tt.mentions = new ArrayList<>(); for (TextSpan mark : marks) { String entity = mark.info().get("entity"); if (entity == null) { entity = mark.info().get("annotation"); } if (entity != null) { EntityMention e = new EntityMention(tt.text, mark.start, mark.end, entity); String minMention = mark.info().get("min"); String mention = e.getMention(); if (minMention != null && !"".equals(minMention)) { Pattern p = Pattern.compile(Pattern.quote(minMention)); Matcher m = p.matcher(mention); if (m.find()) { TextSpan min = new TextSpan(e.text, e.start + m.start(), e.start + m.end()); e.min = min; if (m.find()) { log.warn("found " + minMention + " two times in \"" + mention + "\""); } } else { String prefix = Utility.findLongestPrefix(mention, minMention); log.warn("didn't find min mention '" + minMention + "' in text '" + mention + "', longest prefix found: '" + prefix + "' in article " + tt.id); } } mark.info().remove("min"); mark.info().remove("entity"); if (mark.info().size() > 0) { e.info().putAll(mark.info()); } tt.mentions.add(e); } } openMarks.clear(); marks.clear(); break loop; } break; case XMLStreamConstants.CHARACTERS: String toadd = xsr.getText(); if (pureTextSB.length() == 0) { toadd = StringUtils.stripStart(toadd, " \t\n"); } if (toadd.contains("thanks")) { log.info("test"); } pureTextSB.append(toadd); break; } } } catch (XMLStreamException e) { log.error("{}", errorMessageInfo); throw new RuntimeException(e); } if 
(tt != null && tt.mentions != null) { tt.mentions.sort(null); } return tt; } }; }