List of usage examples for javax.xml.stream XMLEventReader peek
/**
 * Looks at the upcoming event without consuming it; the usages listed below pair this call
 * with {@code nextEvent()} (or {@code next()}) when they actually want to advance.
 *
 * @return the upcoming event. Several usages below compare the result against {@code null}
 *         to detect that no further events are available.
 * @throws XMLStreamException declared by the signature; presumably raised when the underlying
 *         XML cannot be read - confirm against the javax.xml.stream API documentation
 */
public XMLEvent peek() throws XMLStreamException;
From source file:com.streamsets.pipeline.lib.xml.StreamingXmlParser.java
/**
 * Discards leading events for as long as {@code isIgnorable} accepts the upcoming one.
 * Stops at the first event that is not ignorable (leaving it unread) or when the reader
 * reports that nothing is left.
 *
 * @param reader event reader to advance
 * @throws XMLStreamException if peeking at or reading an event fails
 */
void skipIgnorable(XMLEventReader reader) throws XMLStreamException {
  for (;;) {
    if (!reader.hasNext()) {
      return;
    }
    if (!isIgnorable(reader.peek())) {
      return;
    }
    // The peeked event is ignorable: consume it and look at the next one.
    reader.nextEvent();
  }
}
From source file:edu.jhu.hlt.concrete.ingesters.bolt.BoltForumPostIngester.java
/** * Move the iterator so that a call to nextEvent will return the beginning of a post tag. * * @param rdr//from w w w .j ava 2 s . co m * @throws XMLStreamException */ private void iterateToPosts(final XMLEventReader rdr) throws XMLStreamException { // Peek at the next element. XMLEvent fp = rdr.peek(); // If start element and part == "post", return. if (fp.isStartElement()) { StartElement se = fp.asStartElement(); if (se.getName().getLocalPart().equals(POST_LOCAL_NAME)) return; else // Churn through non-post start tags. this.handleNonPostStartElement(rdr); } else // Drop. rdr.nextEvent(); this.iterateToPosts(rdr); }
From source file:de.tudarmstadt.ukp.dkpro.core.io.tiger.TigerXmlReader.java
@Override public void getNext(JCas aJCas) throws IOException, CollectionException { Resource res = nextFile();//from w w w . ja v a 2 s. c o m initCas(aJCas, res); posMappingProvider.configure(aJCas.getCas()); InputStream is = null; try { is = CompressionUtils.getInputStream(res.getLocation(), res.getInputStream()); XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance(); XMLEventReader xmlEventReader = xmlInputFactory.createXMLEventReader(is); JAXBContext context = JAXBContext.newInstance(Meta.class, AnnotationDecl.class, TigerSentence.class); Unmarshaller unmarshaller = context.createUnmarshaller(); JCasBuilder jb = new JCasBuilder(aJCas); XMLEvent e = null; while ((e = xmlEventReader.peek()) != null) { if (isStartElement(e, "s")) { readSentence(jb, unmarshaller.unmarshal(xmlEventReader, TigerSentence.class).getValue()); } else { xmlEventReader.next(); } } jb.close(); // Can only do that after the builder is closed, otherwise the text is not yet set in // the // CAS and we get "null" for all token strings. if (pennTreeEnabled) { for (ROOT root : select(aJCas, ROOT.class)) { PennTree pt = new PennTree(aJCas, root.getBegin(), root.getEnd()); PennTreeNode rootNode = PennTreeUtils.convertPennTree(root); pt.setPennTree(PennTreeUtils.toPennTree(rootNode)); pt.addToIndexes(); } } } catch (XMLStreamException ex1) { throw new IOException(ex1); } catch (JAXBException ex2) { throw new IOException(ex2); } finally { closeQuietly(is); } }
From source file:org.javelin.sws.ext.bind.internal.model.ElementPattern.java
@Override public T consume(XMLEventReader eventReader, UnmarshallingContext context) throws XMLStreamException { // just skip element's START_ELEMENT event StartElement startElement = eventReader.nextEvent().asStartElement(); // StartElement may contain attributes - these are NOT available as separate events in eventReader.nextEvent()! Iterator<?> attributes = startElement.getAttributes(); List<Attribute> attrList = new LinkedList<Attribute>(); while (attributes.hasNext()) { Attribute a = (Attribute) attributes.next(); attrList.add(a);//www . j a va2s. co m } T value = this.nestedPattern.consumeValue(new AttributesAwareXMLEventReader(eventReader, attrList), context); // skip element's END_ELEMENT event while (eventReader.peek() != null) { XMLEvent ev = eventReader.nextEvent(); if (ev.getEventType() == XMLStreamConstants.END_ELEMENT) break; } return value; }
From source file:org.javelin.sws.ext.bind.internal.model.ComplexTypePattern.java
@Override public T consumeValue(XMLEventReader eventReader, UnmarshallingContext context) throws XMLStreamException { // first create an object to be filled (using PropertyAccessors - direct or bean) according to the content model T object = BeanUtils.instantiate(this.getJavaType()); // the order is dictated by incoming events, not by the mode // TODO: create a property to enable strict unmarshalling - dictated by content model // only this (ContentModel) pattern iterates over XML Events XMLEvent event = null;/* w w w. ja va 2 s. co m*/ PropertyMetadataValue<T, ?> pmv = null; // this loop will only handle first level of start elements and only single end element // deeper levels will be handled by nested patterns while (true) { boolean end = false; event = eventReader.peek(); pmv = null; switch (event.getEventType()) { case XMLStreamConstants.ATTRIBUTE: pmv = this.consumeNestedAttribute(eventReader, context); break; case XMLStreamConstants.CDATA: case XMLStreamConstants.CHARACTERS: // TODO: XMLEvent.ENTITY_REFERENCE? if (this.simpleContent != null) { pmv = this.consumeSimpleContent(eventReader, context); break; } case XMLStreamConstants.COMMENT: case XMLStreamConstants.DTD: case XMLStreamConstants.SPACE: case XMLStreamConstants.ENTITY_DECLARATION: case XMLStreamConstants.NOTATION_DECLARATION: case XMLStreamConstants.PROCESSING_INSTRUCTION: eventReader.nextEvent(); break; case XMLStreamConstants.ENTITY_REFERENCE: // TODO: XMLEvent.ENTITY_REFERENCE? eventReader.nextEvent(); break; case XMLStreamConstants.START_DOCUMENT: // strange break; case XMLStreamConstants.START_ELEMENT: pmv = this.consumeNestedElement(eventReader, context); break; case XMLStreamConstants.END_ELEMENT: // TODO: in mixed content there will be more than one end element it this content model's level case XMLStreamConstants.END_DOCUMENT: end = true; break; } if (end) break; if (pmv != null) pmv.getMetadata().setValue(object, pmv.getValue()); } return (T) object; }
From source file:de.dfki.km.leech.parser.wikipedia.WikipediaDumpParser.java
public MultiValueHashMap<String, String> getPageTitle2Redirects(InputStream sWikipediaDump) throws FileNotFoundException, XMLStreamException { // <text xml:space="preserve">#REDIRECT [[Autopoiesis]]</text> // <text xml:space="preserve">#REDIRECT:[[Hans Leo Haler]]</text> // <text xml:space="preserve">#redirect [[Weier Hai]]</text> // #weiterleitung // <page> // <title>Autopoiesis</title> Logger.getLogger(WikipediaDumpParser.class.getName()).info("will collect redirects from wikipedia dump..."); MultiValueHashMap<String, String> hsPageTitle2Redirects = new MultiValueBalancedTreeMap<String, String>(); String strCurrentTitle = ""; XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance(); XMLEventReader xmlEventReader = xmlInputFactory.createXMLEventReader(sWikipediaDump, "Utf-8"); int iTitlesRead = 0; while (xmlEventReader.hasNext()) { XMLEvent xmlEvent = xmlEventReader.nextEvent(); if (!xmlEvent.isStartElement()) continue; // wenn wir einen Title haben, dann merken wir uns den, falls wir ihn brauchen if (xmlEvent.asStartElement().getName().getLocalPart().equals("title")) { strCurrentTitle = readNextCharEventsText(xmlEventReader); iTitlesRead++;/*from w w w. j av a2 s .co m*/ if (iTitlesRead % 200000 == 0) Logger.getLogger(WikipediaDumpParser.class.getName()) .info("read doc #" + StringUtils.beautifyNumber(iTitlesRead)); continue; } if (!xmlEvent.asStartElement().getName().getLocalPart().equals("text")) continue; // jetzt haben wir ein text-tag. Wir schauen, ob jetzt ein redirect kommt // entweder kommt ein charEvent oder ein EndEvent. 
Leere Texte gibts wohl auch XMLEvent nextEvent = xmlEventReader.peek(); if (!nextEvent.isCharacters()) continue; String strCharEventData = readNextCharEventsText(xmlEventReader); if (strCharEventData == null) continue; strCharEventData = strCharEventData.trim(); boolean bRedirect = false; if (strCharEventData.length() >= 9 && strCharEventData.substring(0, 9).equalsIgnoreCase("#redirect")) bRedirect = true; if (!bRedirect && strCharEventData.length() >= 8 && strCharEventData.substring(0, 8).equalsIgnoreCase("redirect") && !strCharEventData.contains("\n")) bRedirect = true; if (!bRedirect && strCharEventData.length() >= 14 && strCharEventData.substring(0, 14).equalsIgnoreCase("#weiterleitung")) bRedirect = true; if (!bRedirect && strCharEventData.length() >= 13 && strCharEventData.substring(0, 13).equalsIgnoreCase("weiterleitung") && !strCharEventData.contains("\n")) bRedirect = true; if (!bRedirect) continue; // wir haben einen redirect - der wird in unsere Datenstruktur eingetragen int iStart = strCharEventData.indexOf("[["); int iEnd = strCharEventData.indexOf("]]"); if (iStart < 0 || iEnd < 0) continue; if (iEnd <= iStart) continue; if ((iStart + 2) > strCharEventData.length() || iEnd > strCharEventData.length()) continue; String strRedirectTarget = strCharEventData.substring(iStart + 2, iEnd).trim(); hsPageTitle2Redirects.add(strRedirectTarget, strCurrentTitle); // if("Venceslav Konstantinov".equalsIgnoreCase(strCurrentTitle) || "Venceslav Konstantinov".equalsIgnoreCase(strRedirectTarget)) // System.out.println("redirect found: (" + hsPageTitle2Redirects.keySize() + ") " + strCurrentTitle + " => '" + strRedirectTarget + "'"); } Logger.getLogger(WikipediaDumpParser.class.getName()) .info("Redirects found: " + StringUtils.beautifyNumber(hsPageTitle2Redirects.valueSize())); return hsPageTitle2Redirects; }
From source file:org.alex73.osm.converters.bel.Convert.java
public static void main(String[] args) throws Exception { loadStreetNamesForHouses();/* w w w . ja va2 s. c om*/ InputStream in = new BZip2CompressorInputStream( new BufferedInputStream(new FileInputStream("tmp/belarus-latest.osm.bz2"), BUFFER_SIZE)); // create xml event reader for input stream XMLEventFactory eventFactory = XMLEventFactory.newInstance(); XMLEvent newLine = eventFactory.createCharacters("\n"); XMLInputFactory xif = XMLInputFactory.newInstance(); XMLOutputFactory xof = XMLOutputFactory.newInstance(); XMLEventReader reader = xif.createXMLEventReader(in); XMLEventWriter wrCyr = xof.createXMLEventWriter( new BufferedOutputStream(new FileOutputStream("tmp/belarus-bel.osm"), BUFFER_SIZE)); XMLEventWriter wrInt = xof.createXMLEventWriter( new BufferedOutputStream(new FileOutputStream("tmp/belarus-intl.osm"), BUFFER_SIZE)); // initialize jaxb JAXBContext jaxbCtx = JAXBContext.newInstance(Node.class, Way.class, Relation.class); Unmarshaller um = jaxbCtx.createUnmarshaller(); Marshaller m = jaxbCtx.createMarshaller(); m.setProperty(Marshaller.JAXB_FRAGMENT, true); m.setProperty(Marshaller.JAXB_FORMATTED_OUTPUT, Boolean.TRUE); XMLEvent e = null; while ((e = reader.peek()) != null) { boolean processed = false; if (e.isStartElement()) { StartElement se = (StartElement) e; switch (se.getName().getLocalPart()) { case "way": Way way = um.unmarshal(reader, Way.class).getValue(); if (way.getId() == 25439425) { System.out.println(); } fixBel(way.getTag(), "name:be", "name"); String nameBeHouse = houseStreetBe.get(way.getId()); if (nameBeHouse != null) { setTag(way.getTag(), "addr:street", nameBeHouse); } m.marshal(way, wrCyr); fixInt(way.getTag()); m.marshal(way, wrInt); wrCyr.add(newLine); wrInt.add(newLine); processed = true; break; case "node": Node node = um.unmarshal(reader, Node.class).getValue(); fixBel(node.getTag(), "name:be", "name"); // fixBel(node.getTag(),"addr:street:be","addr:street"); m.marshal(node, wrCyr); fixInt(node.getTag()); m.marshal(node, 
wrInt); wrCyr.add(newLine); wrInt.add(newLine); processed = true; break; case "relation": Relation relation = um.unmarshal(reader, Relation.class).getValue(); fixBel(relation.getTag(), "name:be", "name"); // fixBel(relation.getTag(),"addr:street:be","addr:street"); m.marshal(relation, wrCyr); fixInt(relation.getTag()); m.marshal(relation, wrInt); wrCyr.add(newLine); wrInt.add(newLine); processed = true; break; } } if (!processed) { wrCyr.add(e); wrInt.add(e); } reader.next(); } wrCyr.flush(); wrCyr.close(); wrInt.flush(); wrInt.close(); System.out.println("UniqueTranslatedTags: " + uniqueTranslatedTags); }
From source file:org.apache.olingo.client.core.serialization.AtomDeserializer.java
/**
 * Classifies a property by looking at the first event that is not whitespace-only text.
 * Whitespace-only character events are consumed; the deciding event itself is left unread.
 *
 * @param reader event reader positioned inside the property element
 * @param typeInfo type information for the property; may be {@code null}
 * @return the guessed property type
 * @throws XMLStreamException if reading an event fails
 */
private PropertyType guessPropertyType(final XMLEventReader reader, final EdmTypeInfo typeInfo)
    throws XMLStreamException {

  XMLEvent deciding = null;
  while (reader.hasNext()) {
    final XMLEvent upcoming = reader.peek();
    final boolean blankText = upcoming.isCharacters() && upcoming.asCharacters().isWhiteSpace();
    if (!blankText) {
      deciding = upcoming;
      break;
    }
    reader.nextEvent();
  }

  // Nothing left at all: decided by the type information alone.
  if (deciding == null) {
    return typeInfo == null || typeInfo.isPrimitiveType() ? PropertyType.PRIMITIVE : PropertyType.ENUM;
  }

  if (deciding.isStartElement()) {
    final StartElement start = deciding.asStartElement();
    if (Constants.NS_GML.equals(start.getName().getNamespaceURI())) {
      return PropertyType.PRIMITIVE;
    }
    if (elementQName.equals(start.getName())) {
      return PropertyType.COLLECTION;
    }
    return PropertyType.COMPLEX;
  }

  // Plain text content: again decided by the type information.
  if (deciding.isCharacters()) {
    return typeInfo == null || typeInfo.isPrimitiveType() ? PropertyType.PRIMITIVE : PropertyType.ENUM;
  }

  return PropertyType.EMPTY;
}
From source file:org.apache.olingo.client.core.serialization.AtomDeserializer.java
private StartElement getStartElement(final XMLEventReader reader) throws XMLStreamException { while (reader.hasNext()) { final XMLEvent innerEvent = reader.peek(); if (innerEvent.isCharacters() && innerEvent.asCharacters().isWhiteSpace()) { reader.nextEvent();//from ww w . j a v a 2 s .c om } else if (innerEvent.isStartElement()) { return innerEvent.asStartElement(); } else if (innerEvent.isEndElement() && inlineQName.equals(innerEvent.asEndElement().getName())) { return null; } } return null; }
From source file:org.apache.olingo.commons.core.data.AtomPropertyDeserializer.java
private ODataPropertyType guessPropertyType(final XMLEventReader reader) throws XMLStreamException { XMLEvent child = null;// w w w . j a v a 2 s. co m while (reader.hasNext() && child == null) { final XMLEvent event = reader.peek(); if (event.isCharacters() && event.asCharacters().isWhiteSpace()) { reader.nextEvent(); } else { child = event; } } final ODataPropertyType type; if (child == null) { type = ODataPropertyType.PRIMITIVE; } else { if (child.isStartElement()) { if (Constants.NS_GML.equals(child.asStartElement().getName().getNamespaceURI())) { type = ODataPropertyType.PRIMITIVE; } else if (elementQName.equals(child.asStartElement().getName())) { type = ODataPropertyType.COLLECTION; } else { type = ODataPropertyType.COMPLEX; } } else if (child.isCharacters()) { type = ODataPropertyType.PRIMITIVE; } else { type = ODataPropertyType.EMPTY; } } return type; }