List of usage examples for javax.xml.stream XMLInputFactory createXMLEventReader
public abstract XMLEventReader createXMLEventReader(String systemId, java.io.InputStream stream) throws XMLStreamException;
From source file:Main.java
/**
 * Prints the StAX factory in use and then every event of {@code yourXML.xml}
 * to standard output.
 *
 * @param args ignored
 * @throws Exception if the file cannot be opened or is not well-formed XML
 */
public static void main(String[] args) throws Exception {
    String filename = "yourXML.xml";

    XMLInputFactory factory = XMLInputFactory.newInstance();
    System.out.println("FACTORY: " + factory);

    // XMLEventReader.close() does not close the underlying input source,
    // so the stream is managed separately with try-with-resources.
    try (FileInputStream in = new FileInputStream(filename)) {
        XMLEventReader r = factory.createXMLEventReader(filename, in);
        try {
            while (r.hasNext()) {
                XMLEvent e = r.nextEvent();
                System.out.println(e.toString());
            }
        } finally {
            r.close();
        }
    }
}
From source file:Main.java
/**
 * Opens {@code filename} and returns an event reader over it.
 *
 * <p>The factory is configured to replace entity references, to not support
 * external entities, to be namespace aware and to coalesce adjacent character
 * data.
 *
 * <p>The returned reader is backed by a file stream opened here. Closing the
 * reader does not close that stream (see {@code XMLEventReader.close()}), so
 * the stream stays open for as long as the reader is in use.
 *
 * @param filename path of the XML file; also used as the system id
 * @return an event reader positioned at the start of the document
 * @throws Exception if the file cannot be opened or the reader cannot be created
 */
private static XMLEventReader getXMLEventReader(String filename) throws Exception {
    XMLInputFactory xmlif = XMLInputFactory.newInstance();
    xmlif.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE);
    xmlif.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
    xmlif.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.TRUE);
    xmlif.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);

    FileInputStream fis = new FileInputStream(filename);
    try {
        return xmlif.createXMLEventReader(filename, fis);
    } catch (Exception ex) {
        // No reader was handed out, so nobody else can release the stream.
        try {
            fis.close();
        } catch (Exception closeEx) {
            ex.addSuppressed(closeEx);
        }
        throw ex;
    }
}
From source file:ValidateStax.java
/**
 * Opens {@code filename} and returns an event reader over it, or {@code null}
 * if anything goes wrong.
 *
 * <p>The factory is configured to replace entity references, to not support
 * external entities, to be namespace aware and to coalesce adjacent character
 * data. Failures are printed to standard error and reported to the caller as
 * a {@code null} return value.
 *
 * @param filename path of the XML file; also used as the system id
 * @return the event reader, or {@code null} if the file could not be opened
 *         or the reader could not be created
 */
private static XMLEventReader getXMLEventReader(String filename) {
    XMLEventReader xmlr = null;
    FileInputStream fis = null;
    try {
        XMLInputFactory xmlif = XMLInputFactory.newInstance();
        xmlif.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE);
        xmlif.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        xmlif.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.TRUE);
        xmlif.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);

        fis = new FileInputStream(filename);
        xmlr = xmlif.createXMLEventReader(filename, fis);
    } catch (Exception ex) {
        ex.printStackTrace();
        // Reaching this point means no reader was created; release the
        // stream here because the caller only receives null.
        if (fis != null) {
            try {
                fis.close();
            } catch (Exception closeEx) {
                closeEx.printStackTrace();
            }
        }
    }
    return xmlr;
}
From source file:StreamSrcStAXRst.java
private static XMLEventReader getXMLEventReader(String filename) { XMLInputFactory xmlif = null; XMLEventReader xmlr = null;//from ww w . ja va 2 s . co m try { xmlif = XMLInputFactory.newInstance(); xmlif.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE); xmlif.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE); xmlif.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.TRUE); xmlif.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE); FileInputStream fis = new FileInputStream(filename); xmlr = xmlif.createXMLEventReader(filename, fis); } catch (Exception ex) { ex.printStackTrace(); } return xmlr; }
From source file:com.vistatec.ocelot.xliff.okapi.OkapiXLIFFFactory.java
@Override public XLIFFVersion detectXLIFFVersion(File detectVersion) throws IOException, XMLStreamException { try (BOMInputStream bomInputStream = new BOMInputStream(new FileInputStream(detectVersion), ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE, ByteOrderMark.UTF_32LE)) {//w ww . j a v a 2 s . c o m String bom = "UTF-8"; if (bomInputStream.hasBOM()) { bom = bomInputStream.getBOMCharsetName(); } XMLInputFactory xml = XMLInputFactory.newInstance(); XMLEventReader reader = xml.createXMLEventReader(bomInputStream, bom); while (reader.hasNext()) { XMLEvent event = reader.nextEvent(); switch (event.getEventType()) { case XMLEvent.START_ELEMENT: StartElement startElement = (StartElement) event; String localPart = startElement.getName().getLocalPart(); if (localPart.equals("xliff")) { @SuppressWarnings("unchecked") Iterator<Attribute> attrs = startElement.getAttributes(); while (attrs.hasNext()) { Attribute attr = attrs.next(); if (isXliffVersionAttributeName(attr.getName())) { String value = attr.getValue(); reader.close(); if ("2.0".equals(value)) { return XLIFFVersion.XLIFF20; } else { return XLIFFVersion.XLIFF12; } } } } break; default: break; } } throw new IllegalStateException("Could not detect XLIFF version"); } }
From source file:eionet.webq.converter.JsonXMLBidirectionalConverter.java
/**
 * Shared conversion routine: pipes every event read by {@code inputFactory}
 * into a writer created by {@code outputFactory}.
 *
 * <p>The output writer is wrapped in a {@code PrettyXMLEventWriter} before
 * the events are copied (presumably to indent the result; see that class).
 *
 * @param inputFactory factory used to read {@code source}.
 * @param outputFactory factory used to write the result.
 * @param source bytes to convert, read as UTF-8.
 * @return the converted document as a byte array.
 * @throws RuntimeException wrapping any {@link XMLStreamException} raised
 *         while reading or writing.
 */
private byte[] convert(XMLInputFactory inputFactory, XMLOutputFactory outputFactory, byte[] source) {
    InputStream in = new ByteArrayInputStream(source);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    try {
        XMLEventReader eventReader = inputFactory.createXMLEventReader(in, "utf-8");
        XMLEventWriter eventWriter =
                new PrettyXMLEventWriter(outputFactory.createXMLEventWriter(out, "utf-8"));

        eventWriter.add(eventReader);

        // Reader and writer are closed before the buffer is read so that
        // anything still pending in the writer reaches the byte array.
        closeQuietly(eventReader, eventWriter);
        return out.toByteArray();
    } catch (XMLStreamException e) {
        throw new RuntimeException(e);
    } finally {
        IOUtils.closeQuietly(out);
        IOUtils.closeQuietly(in);
    }
}
From source file:com.act.lcms.MzMLParser.java
public Iterator<S> getIterator(String inputFile) throws ParserConfigurationException, IOException, XMLStreamException { DocumentBuilderFactory docFactory = mkDocBuilderFactory(); DocumentBuilder docBuilder = docFactory.newDocumentBuilder(); final XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance(); final XMLOutputFactory xmlOutputFactory = XMLOutputFactory.newInstance(); return new Iterator<S>() { boolean inEntry = false; XMLEventReader xr = xmlInputFactory.createXMLEventReader(new FileInputStream(inputFile), "utf-8"); // TODO: is the use of the XML version/encoding tag definitely necessary? StringWriter w = new StringWriter().append(XML_PREAMBLE).append("\n"); XMLEventWriter xw = xmlOutputFactory.createXMLEventWriter(w); S next = null;//from w w w . j a v a 2 s .com /* Because we're handling the XML as a stream, we can only determine whether we have another Spectrum to return * by attempting to parse the next one. `this.next()` reads */ private S getNextSpectrum() { S spectrum = null; if (xr == null || !xr.hasNext()) { return null; } try { while (xr.hasNext()) { XMLEvent e = xr.nextEvent(); if (!inEntry && e.isStartElement() && e.asStartElement().getName().getLocalPart().equals((SPECTRUM_OBJECT_TAG))) { xw.add(e); inEntry = true; } else if (e.isEndElement() && e.asEndElement().getName().getLocalPart().equals(SPECTRUM_OBJECT_TAG)) { xw.add(e); xw.flush(); /* TODO: the XMLOutputFactory docs don't make it clear if/how events can be written directly into a new * document structure, so we incur the cost of extracting each spectrum entry, serializing it, and * re-reading it into its own document so it can be handled by XPath. Master this strange corner of the * Java ecosystem and get rid of <></>his doc -> string -> doc conversion. 
*/ Document doc = docBuilder.parse(new ReaderInputStream(new StringReader(w.toString()))); spectrum = handleSpectrumEntry(doc); xw.close(); /* Note: this can also be accomplished with `w.getBuffer().setLength(0);`, but using a new event writer * seems safer. */ w = new StringWriter(); w.append(XML_PREAMBLE).append("\n"); xw = xmlOutputFactory.createXMLEventWriter(w); inEntry = false; // Don't stop parsing if handleSpectrumEntry didn't like this spectrum document. if (spectrum != null) { break; } } else if (inEntry) { // Add this element if we're in an entry xw.add(e); } } // We've reached the end of the document; close the reader to show that we're done. if (!xr.hasNext()) { xr.close(); xr = null; } } catch (Exception e) { // TODO: do better. We seem to run into this sort of thing with Iterators a lot... throw new RuntimeException(e); } return spectrum; } private S tryParseNext() { // Fail the attempt if the reader is closed. if (xr == null || !xr.hasNext()) { return null; } // No checks on whether we already have a spectrum stored: we expect the callers to do that. return getNextSpectrum(); } @Override public boolean hasNext() { // Prime the pump if the iterator doesn't have a value stored yet. if (this.next == null) { this.next = tryParseNext(); } // If we have an entry waiting, return true; otherwise read the next entry and return true if successful. return this.next != null; } @Override public S next() { // Prime the pump like we do in hasNext(). if (this.next == null) { this.next = tryParseNext(); } // Take available spectrum and return it. S res = this.next; /* Advance to the next element immediately, making next() do the heavy lifting most of the time. Otherwise, * the parsing will resume on hasNext(), which seems like it ought to be a light-weight operation. */ this.next = tryParseNext(); return res; } }; }
From source file:de.dfki.km.leech.parser.wikipedia.WikipediaDumpParser.java
public MultiValueHashMap<String, String> getPageTitle2Redirects(InputStream sWikipediaDump) throws FileNotFoundException, XMLStreamException { // <text xml:space="preserve">#REDIRECT [[Autopoiesis]]</text> // <text xml:space="preserve">#REDIRECT:[[Hans Leo Haler]]</text> // <text xml:space="preserve">#redirect [[Weier Hai]]</text> // #weiterleitung // <page> // <title>Autopoiesis</title> Logger.getLogger(WikipediaDumpParser.class.getName()).info("will collect redirects from wikipedia dump..."); MultiValueHashMap<String, String> hsPageTitle2Redirects = new MultiValueBalancedTreeMap<String, String>(); String strCurrentTitle = ""; XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance(); XMLEventReader xmlEventReader = xmlInputFactory.createXMLEventReader(sWikipediaDump, "Utf-8"); int iTitlesRead = 0; while (xmlEventReader.hasNext()) { XMLEvent xmlEvent = xmlEventReader.nextEvent(); if (!xmlEvent.isStartElement()) continue; // wenn wir einen Title haben, dann merken wir uns den, falls wir ihn brauchen if (xmlEvent.asStartElement().getName().getLocalPart().equals("title")) { strCurrentTitle = readNextCharEventsText(xmlEventReader); iTitlesRead++;//from w ww . ja va 2 s . co m if (iTitlesRead % 200000 == 0) Logger.getLogger(WikipediaDumpParser.class.getName()) .info("read doc #" + StringUtils.beautifyNumber(iTitlesRead)); continue; } if (!xmlEvent.asStartElement().getName().getLocalPart().equals("text")) continue; // jetzt haben wir ein text-tag. Wir schauen, ob jetzt ein redirect kommt // entweder kommt ein charEvent oder ein EndEvent. 
Leere Texte gibts wohl auch XMLEvent nextEvent = xmlEventReader.peek(); if (!nextEvent.isCharacters()) continue; String strCharEventData = readNextCharEventsText(xmlEventReader); if (strCharEventData == null) continue; strCharEventData = strCharEventData.trim(); boolean bRedirect = false; if (strCharEventData.length() >= 9 && strCharEventData.substring(0, 9).equalsIgnoreCase("#redirect")) bRedirect = true; if (!bRedirect && strCharEventData.length() >= 8 && strCharEventData.substring(0, 8).equalsIgnoreCase("redirect") && !strCharEventData.contains("\n")) bRedirect = true; if (!bRedirect && strCharEventData.length() >= 14 && strCharEventData.substring(0, 14).equalsIgnoreCase("#weiterleitung")) bRedirect = true; if (!bRedirect && strCharEventData.length() >= 13 && strCharEventData.substring(0, 13).equalsIgnoreCase("weiterleitung") && !strCharEventData.contains("\n")) bRedirect = true; if (!bRedirect) continue; // wir haben einen redirect - der wird in unsere Datenstruktur eingetragen int iStart = strCharEventData.indexOf("[["); int iEnd = strCharEventData.indexOf("]]"); if (iStart < 0 || iEnd < 0) continue; if (iEnd <= iStart) continue; if ((iStart + 2) > strCharEventData.length() || iEnd > strCharEventData.length()) continue; String strRedirectTarget = strCharEventData.substring(iStart + 2, iEnd).trim(); hsPageTitle2Redirects.add(strRedirectTarget, strCurrentTitle); // if("Venceslav Konstantinov".equalsIgnoreCase(strCurrentTitle) || "Venceslav Konstantinov".equalsIgnoreCase(strRedirectTarget)) // System.out.println("redirect found: (" + hsPageTitle2Redirects.keySize() + ") " + strCurrentTitle + " => '" + strRedirectTarget + "'"); } Logger.getLogger(WikipediaDumpParser.class.getName()) .info("Redirects found: " + StringUtils.beautifyNumber(hsPageTitle2Redirects.valueSize())); return hsPageTitle2Redirects; }
From source file:de.dfki.km.leech.parser.wikipedia.WikipediaDumpParser.java
@Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { try {// w w w . j a v a2 s . c om // wir iterieren schn ber die page-Eintrge. Darin gibt es dann title, timestamp, <contributor> => <username> und text. den text mssen // wir noch bereinigen. dazu nehmen wir eine Vorverarbeitung mit bliki - dazu mssen wir aber selbst nochmal den String vorbereiten und // nachbereinigen. Leider. WikipediaDumpParserConfig wikipediaDumpParserConfig = context.get(WikipediaDumpParserConfig.class); if (wikipediaDumpParserConfig == null) { Logger.getLogger(WikipediaDumpParser.class.getName()) .info("No wikipedia parser config found. Will take the default one."); wikipediaDumpParserConfig = new WikipediaDumpParserConfig(); } TikaInputStream tikaStream = TikaInputStream.get(stream); File fWikipediaDumpFile4Stream = tikaStream.getFile(); MultiValueHashMap<String, String> hsPageTitle2Redirects = new MultiValueHashMap<String, String>(); if (wikipediaDumpParserConfig.determinePageRedirects) hsPageTitle2Redirects = getPageTitle2Redirects(new FileInputStream(fWikipediaDumpFile4Stream)); HashSet<String> hsRedirectPageTitles = new HashSet<String>(hsPageTitle2Redirects.values()); String strCleanedText = ""; String strBaseURL = null; XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance(); XMLEventReader xmlEventReader = xmlInputFactory .createXMLEventReader(new FileInputStream(fWikipediaDumpFile4Stream), "Utf-8"); while (xmlEventReader.hasNext()) { XMLEvent xmlEvent = xmlEventReader.nextEvent(); if (xmlEvent.isEndElement() && xmlEvent.asEndElement().getName().getLocalPart().equals("page")) { if (metadata.size() == 0) continue; // den mimetype wollen wir auch noch in den Metadaten haben metadata.add(Metadata.CONTENT_TYPE, "application/wikipedia+xml"); XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); xhtml.startElement("p"); 
xhtml.characters(strCleanedText.toCharArray(), 0, strCleanedText.length()); xhtml.endElement("p"); xhtml.endDocument(); } if (!xmlEvent.isStartElement()) continue; // ##### die siteinfo if (strBaseURL == null && xmlEvent.asStartElement().getName().getLocalPart().equals("base")) { // http://de.wikipedia.org/wiki/Wikipedia:Hauptseite =>http://de.wikipedia.org/wiki/ strBaseURL = readNextCharEventsText(xmlEventReader); strBaseURL = strBaseURL.substring(0, strBaseURL.lastIndexOf("/") + 1); } // ##### die page if (xmlEvent.asStartElement().getName().getLocalPart().equals("page")) { for (String strKey : metadata.names()) metadata.remove(strKey); } // ##### der Title if (xmlEvent.asStartElement().getName().getLocalPart().equals("title")) { // wir merken uns immer den aktuellen Titel String strCurrentTitle = readNextCharEventsText(xmlEventReader); if (strCurrentTitle.equalsIgnoreCase("DuckDuckGo")) { int fasd = 8; } if (strCurrentTitle.toLowerCase().contains("duck") && strCurrentTitle.toLowerCase().contains("go")) { int is = 666; } // wenn der Titel eine redirect-Page ist, dann tragen wir die ganze Page aus der EventQueue aus, springen an das endPage, und // haben somit diese Seite ignoriert. 
Ferner ignorieren wir auch spezielle wikipedia-Seiten String strSmallTitle = strCurrentTitle.trim().toLowerCase(); if (hsRedirectPageTitles.contains(strCurrentTitle) || hsRedirectPageTitles.contains(strSmallTitle) || hsRedirectPageTitles.contains(strCurrentTitle.trim()) || strSmallTitle.startsWith("category:") || strSmallTitle.startsWith("kategorie:") || strSmallTitle.startsWith("vorlage:") || strSmallTitle.startsWith("template:") || strSmallTitle.startsWith("hilfe:") || strSmallTitle.startsWith("help:") || strSmallTitle.startsWith("wikipedia:") || strSmallTitle.startsWith("portal:") || strSmallTitle.startsWith("mediawiki:")) { while (true) { XMLEvent nextXmlEvent = xmlEventReader.nextEvent(); if (nextXmlEvent.isEndElement() && nextXmlEvent.asEndElement().getName().getLocalPart().equals("page")) break; } } else { metadata.add(Metadata.TITLE, strCurrentTitle); metadata.add(Metadata.SOURCE, strBaseURL + strCurrentTitle); for (String strRedirect : hsPageTitle2Redirects.get(strCurrentTitle)) { // wir ignorieren Titel, die sich lediglich durch gro/kleinschreibung unterscheiden if (!StringUtils.containsIgnoreCase(strRedirect, metadata.getValues(Metadata.TITLE))) metadata.add(Metadata.TITLE, strRedirect); } } continue; } // ##### der text if (xmlEvent.asStartElement().getName().getLocalPart().equals("text")) { String strText = readNextCharEventsText(xmlEventReader); if (wikipediaDumpParserConfig.parseLinksAndCategories) parseLinksAndCategories(strText, strBaseURL, metadata, handler); if (wikipediaDumpParserConfig.parseInfoBoxes) parseInfoBox(strText, metadata, handler); if (wikipediaDumpParserConfig.parseGeoCoordinates) parseGeoCoordinates(strText, metadata); // aufgrund einiger Defizite in dem verwendeten cleaner mssen wir hier leider noch zu-und nacharbeiten strText = strText.replaceAll("==\n", "==\n\n"); strText = strText.replaceAll("\n==", "\n\n=="); strCleanedText = m_wikiModel.render(new PlainTextConverter(), strText); strCleanedText = 
strCleanedText.replaceAll("\\{\\{", " "); strCleanedText = strCleanedText.replaceAll("\\}\\}", " "); strCleanedText = StringEscapeUtils.unescapeHtml4(strCleanedText); continue; } // ##### der timestamp if (xmlEvent.asStartElement().getName().getLocalPart().equals("timestamp")) { String strTimestamp = readNextCharEventsText(xmlEventReader); metadata.add(Metadata.MODIFIED, strTimestamp); continue; } // ##### der username if (xmlEvent.asStartElement().getName().getLocalPart().equals("username")) { String strUsername = readNextCharEventsText(xmlEventReader); metadata.add(Metadata.CREATOR, strUsername); continue; } } } catch (Exception e) { Logger.getLogger(WikipediaDumpParser.class.getName()).log(Level.SEVERE, "Error", e); } }
From source file:ca.phon.session.io.xml.v12.XMLSessionReader_v12.java
@Override public boolean canRead(File file) throws IOException { // open file and make sure the first // element is 'session' with the correct version boolean canRead = false; // use StAX to read only first element // create StAX reader XMLInputFactory factory = XMLInputFactory.newInstance(); XMLEventReader reader = null; try (FileInputStream source = new FileInputStream(file)) { //BufferedReader in = new BufferedReader(new InputStreamReader(source, "UTF-8")); XMLEventReader xmlReader = factory.createXMLEventReader(source, "UTF-8"); reader = factory.createFilteredReader(xmlReader, new XMLWhitespaceFilter()); XMLEvent evt;/*from w w w .jav a2 s . com*/ while (!(evt = reader.nextEvent()).isStartElement()) ; canRead = evt.asStartElement().getName().getLocalPart().equals("session") && evt.asStartElement().getAttributeByName(new QName("version")).getValue().equals("PB1.2"); } catch (XMLStreamException e) { throw new IOException(e); } return canRead; }