Example usage for javax.xml.stream XMLInputFactory createXMLEventReader

Introduction

On this page you can find example usages of javax.xml.stream.XMLInputFactory.createXMLEventReader.

Prototype

public abstract XMLEventReader createXMLEventReader(String systemId, java.io.InputStream stream)
        throws XMLStreamException;

Source Link

Document

Create a new XMLEventReader from a java.io.InputStream

Usage

From source file:Main.java

/**
 * Prints the factory implementation and then every StAX event of {@code yourXML.xml}
 * to standard output.
 *
 * @param args ignored
 * @throws Exception if the file cannot be opened or read as XML
 */
public static void main(String[] args) throws Exception {
    String filename = "yourXML.xml";

    XMLInputFactory factory = XMLInputFactory.newInstance();
    System.out.println("FACTORY: " + factory);

    // The stream is managed separately from the reader: closing an XMLEventReader
    // does not close the input source it was created from.
    try (FileInputStream in = new FileInputStream(filename)) {
        XMLEventReader r = factory.createXMLEventReader(filename, in);
        try {
            while (r.hasNext()) {
                XMLEvent e = r.nextEvent();
                System.out.println(e.toString());
            }
        } finally {
            r.close();
        }
    }
}

From source file:Main.java

/**
 * Opens {@code filename} and returns a StAX event reader over it.
 *
 * <p>The factory is configured to replace entity references, to refuse external
 * entities, to be namespace aware and to coalesce adjacent character data.
 *
 * <p>NOTE(review): the file stream is not exposed to the caller, so on success it can
 * only be released indirectly; confirm whether callers need a handle to close it.
 *
 * @param filename path of the XML file; also passed to the factory as the system id
 * @return a reader positioned at the start of the document
 * @throws Exception if the file cannot be opened or the reader cannot be created
 */
private static XMLEventReader getXMLEventReader(String filename) throws Exception {
    XMLInputFactory xmlif = XMLInputFactory.newInstance();
    xmlif.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE);
    xmlif.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
    xmlif.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.TRUE);
    xmlif.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);

    FileInputStream fis = new FileInputStream(filename);
    try {
        return xmlif.createXMLEventReader(filename, fis);
    } catch (Exception ex) {
        // The caller never receives the stream when creation fails, so release it here.
        try {
            fis.close();
        } catch (Exception closeFailure) {
            ex.addSuppressed(closeFailure);
        }
        throw ex;
    }
}

From source file:ValidateStax.java

/**
 * Opens {@code filename} and returns a StAX event reader over it, or {@code null} on failure.
 *
 * <p>The factory is configured to replace entity references, to refuse external
 * entities, to be namespace aware and to coalesce adjacent character data.
 * Any exception is printed to standard error and reported to the caller as {@code null}.
 *
 * @param filename path of the XML file; also passed to the factory as the system id
 * @return the reader, or {@code null} if the file could not be opened or parsed
 */
private static XMLEventReader getXMLEventReader(String filename) {
    XMLEventReader xmlr = null;
    try {
        XMLInputFactory xmlif = XMLInputFactory.newInstance();
        xmlif.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE);
        xmlif.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        xmlif.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.TRUE);
        xmlif.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);

        FileInputStream fis = new FileInputStream(filename);
        try {
            xmlr = xmlif.createXMLEventReader(filename, fis);
        } finally {
            // No reader means creation failed; nobody else holds the stream, so close it.
            if (xmlr == null) {
                fis.close();
            }
        }
    } catch (Exception ex) {
        ex.printStackTrace();
    }
    return xmlr;
}

From source file:StreamSrcStAXRst.java

/**
 * Creates a StAX event reader for the given file.
 *
 * <p>Entity references are replaced, external entities are disabled, and the reader is
 * namespace aware and coalescing. Failures are printed to standard error and signalled
 * by a {@code null} return value rather than an exception.
 *
 * @param filename path of the XML file; also passed to the factory as the system id
 * @return the reader, or {@code null} if opening the file or creating the reader failed
 */
private static XMLEventReader getXMLEventReader(String filename) {
    XMLEventReader xmlr = null;
    try {
        XMLInputFactory xmlif = XMLInputFactory.newInstance();
        xmlif.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE);
        xmlif.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        xmlif.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.TRUE);
        xmlif.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);

        FileInputStream fis = new FileInputStream(filename);
        try {
            xmlr = xmlif.createXMLEventReader(filename, fis);
        } catch (Exception creationFailure) {
            // The stream would otherwise stay open until garbage collection.
            try {
                fis.close();
            } catch (Exception closeFailure) {
                creationFailure.addSuppressed(closeFailure);
            }
            throw creationFailure;
        }
    } catch (Exception ex) {
        ex.printStackTrace();
    }
    return xmlr;
}

From source file:com.vistatec.ocelot.xliff.okapi.OkapiXLIFFFactory.java

/**
 * Determines the XLIFF version of a file from the version attribute of its
 * {@code xliff} element.
 *
 * <p>The encoding handed to the XML parser is taken from the file's byte order mark
 * when one is present, and is UTF-8 otherwise.
 *
 * @param detectVersion the file to inspect
 * @return {@code XLIFF20} when the version attribute equals {@code "2.0"}, otherwise {@code XLIFF12}
 * @throws IOException if the file cannot be read
 * @throws XMLStreamException if the content cannot be parsed as XML
 * @throws IllegalStateException if no {@code xliff} element with a version attribute is found
 */
@Override
public XLIFFVersion detectXLIFFVersion(File detectVersion) throws IOException, XMLStreamException {
    try (BOMInputStream bomInputStream = new BOMInputStream(new FileInputStream(detectVersion),
            ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE,
            ByteOrderMark.UTF_32LE)) {
        String encoding = bomInputStream.hasBOM() ? bomInputStream.getBOMCharsetName() : "UTF-8";

        XMLEventReader reader = XMLInputFactory.newInstance().createXMLEventReader(bomInputStream, encoding);
        while (reader.hasNext()) {
            XMLEvent event = reader.nextEvent();
            if (event.getEventType() != XMLEvent.START_ELEMENT) {
                continue;
            }

            StartElement element = (StartElement) event;
            if (!element.getName().getLocalPart().equals("xliff")) {
                continue;
            }

            @SuppressWarnings("unchecked")
            Iterator<Attribute> attributes = element.getAttributes();
            while (attributes.hasNext()) {
                Attribute candidate = attributes.next();
                if (!isXliffVersionAttributeName(candidate.getName())) {
                    continue;
                }
                String version = candidate.getValue();
                reader.close();
                return "2.0".equals(version) ? XLIFFVersion.XLIFF20 : XLIFFVersion.XLIFF12;
            }
        }
        throw new IllegalStateException("Could not detect XLIFF version");
    }
}

From source file:eionet.webq.converter.JsonXMLBidirectionalConverter.java

/**
 * Shared conversion routine: reads {@code source} as XML events through
 * {@code inputFactory} and writes them through {@code outputFactory}, wrapped in a
 * {@code PrettyXMLEventWriter}.
 *
 * @param inputFactory factory that creates the event reader over {@code source}.
 * @param outputFactory factory that creates the event writer for the result.
 * @param source bytes to convert.
 * @return the written output as a byte array.
 * @throws RuntimeException wrapping any {@link XMLStreamException} raised during conversion.
 */
private byte[] convert(XMLInputFactory inputFactory, XMLOutputFactory outputFactory, byte[] source) {
    InputStream in = new ByteArrayInputStream(source);
    ByteArrayOutputStream sink = new ByteArrayOutputStream();
    try {
        XMLEventReader eventReader = inputFactory.createXMLEventReader(in, "utf-8");
        XMLEventWriter eventWriter = new PrettyXMLEventWriter(
                outputFactory.createXMLEventWriter(sink, "utf-8"));
        // Copies every remaining event from the reader to the writer.
        eventWriter.add(eventReader);
        closeQuietly(eventReader, eventWriter);
        return sink.toByteArray();
    } catch (XMLStreamException e) {
        throw new RuntimeException(e);
    } finally {
        IOUtils.closeQuietly(sink);
        IOUtils.closeQuietly(in);
    }
}

From source file:com.act.lcms.MzMLParser.java

/**
 * Returns an iterator that parses {@code inputFile} as a stream and yields one {@code S}
 * for each element named {@code SPECTRUM_OBJECT_TAG} that {@code handleSpectrumEntry}
 * accepts (i.e. returns non-null for).
 *
 * <p>Each matching element is serialized to a string, re-parsed into its own DOM
 * {@code Document}, and handed to {@code handleSpectrumEntry}.
 *
 * <p>NOTE(review): {@code next()} returns {@code null} once the input is exhausted instead of
 * throwing {@code NoSuchElementException}; callers should guard with {@code hasNext()}.
 *
 * @param inputFile path of the XML file to read
 * @return an iterator over the parsed spectra
 * @throws ParserConfigurationException if the DOM builder cannot be created
 * @throws IOException if the input file cannot be opened
 * @throws XMLStreamException if the StAX reader or writer cannot be created
 */
public Iterator<S> getIterator(String inputFile)
        throws ParserConfigurationException, IOException, XMLStreamException {
    DocumentBuilderFactory docFactory = mkDocBuilderFactory();
    DocumentBuilder docBuilder = docFactory.newDocumentBuilder();

    final XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
    final XMLOutputFactory xmlOutputFactory = XMLOutputFactory.newInstance();

    return new Iterator<S>() {
        // True while events between a spectrum start tag and its end tag are being copied.
        boolean inEntry = false;

        // NOTE(review): the FileInputStream is not kept in a variable, so it is never closed
        // explicitly; xr.close() below presumably does not close it -- confirm and fix if so.
        XMLEventReader xr = xmlInputFactory.createXMLEventReader(new FileInputStream(inputFile), "utf-8");
        // TODO: is the use of the XML version/encoding tag definitely necessary?
        StringWriter w = new StringWriter().append(XML_PREAMBLE).append("\n");
        XMLEventWriter xw = xmlOutputFactory.createXMLEventWriter(w);

        // The spectrum parsed ahead of time and not yet handed out; null when none is buffered.
        S next = null;

        /* Because we're handling the XML as a stream, we can only determine whether we have another Spectrum to return
         * by attempting to parse the next one.  This method reads events until one complete spectrum element has been
         * parsed and accepted, or until the input runs out (in which case it returns null).
         */
        private S getNextSpectrum() {
            S spectrum = null;
            if (xr == null || !xr.hasNext()) {
                return null;
            }

            try {
                while (xr.hasNext()) {
                    XMLEvent e = xr.nextEvent();
                    if (!inEntry && e.isStartElement()
                            && e.asStartElement().getName().getLocalPart().equals((SPECTRUM_OBJECT_TAG))) {
                        // Start of a spectrum element: begin copying events into the writer.
                        xw.add(e);
                        inEntry = true;
                    } else if (e.isEndElement()
                            && e.asEndElement().getName().getLocalPart().equals(SPECTRUM_OBJECT_TAG)) {
                        xw.add(e);
                        xw.flush();
                        /* TODO: the XMLOutputFactory docs don't make it clear if/how events can be written directly into a new
                         * document structure, so we incur the cost of extracting each spectrum entry, serializing it, and
                         * re-reading it into its own document so it can be handled by XPath.  Master this strange corner of the
                         * Java ecosystem and get rid of this doc -> string -> doc conversion. */
                        Document doc = docBuilder.parse(new ReaderInputStream(new StringReader(w.toString())));
                        spectrum = handleSpectrumEntry(doc);
                        xw.close();
                        /* Note: this can also be accomplished with `w.getBuffer().setLength(0);`, but using a new event writer
                         * seems safer. */
                        w = new StringWriter();
                        w.append(XML_PREAMBLE).append("\n");
                        xw = xmlOutputFactory.createXMLEventWriter(w);
                        inEntry = false;
                        // Don't stop parsing if handleSpectrumEntry didn't like this spectrum document.
                        if (spectrum != null) {
                            break;
                        }
                    } else if (inEntry) {
                        // Add this element if we're in an entry
                        xw.add(e);
                    }
                }

                // We've reached the end of the document; close the reader to show that we're done.
                if (!xr.hasNext()) {
                    xr.close();
                    xr = null;
                }
            } catch (Exception e) {
                // TODO: do better.  We seem to run into this sort of thing with Iterators a lot...
                throw new RuntimeException(e);
            }

            return spectrum;
        }

        /* Returns the next accepted spectrum, or null when the reader has already been closed or has no more events. */
        private S tryParseNext() {
            // Fail the attempt if the reader is closed.
            if (xr == null || !xr.hasNext()) {
                return null;
            }

            // No checks on whether we already have a spectrum stored: we expect the callers to do that.
            return getNextSpectrum();
        }

        @Override
        public boolean hasNext() {
            // Prime the pump if the iterator doesn't have a value stored yet.
            if (this.next == null) {
                this.next = tryParseNext();
            }

            // If we have an entry waiting, return true; otherwise read the next entry and return true if successful.
            return this.next != null;
        }

        @Override
        public S next() {
            // Prime the pump like we do in hasNext().
            if (this.next == null) {
                this.next = tryParseNext();
            }

            // Take available spectrum and return it.
            S res = this.next;
            /* Advance to the next element immediately, making next() do the heavy lifting most of the time.  Otherwise,
             * the parsing will resume on hasNext(), which seems like it ought to be a light-weight operation. */
            this.next = tryParseNext();

            return res;
        }

    };
}

From source file:de.dfki.km.leech.parser.wikipedia.WikipediaDumpParser.java

/**
 * Scans a Wikipedia XML dump for redirect pages and maps every redirect target to the
 * titles of the pages that redirect to it.
 *
 * <p>A {@code text} element counts as a redirect when its trimmed content starts
 * (ignoring case) with {@code #redirect} or {@code #weiterleitung}, or with
 * {@code redirect} / {@code weiterleitung} and contains no line break. The target is
 * the text between the first {@code [[} and the first {@code ]]}.
 *
 * @param sWikipediaDump stream over the dump, read as UTF-8
 * @return map from redirect target to the titles redirecting to it
 * @throws FileNotFoundException declared by the signature
 * @throws XMLStreamException if the dump cannot be parsed
 */
public MultiValueHashMap<String, String> getPageTitle2Redirects(InputStream sWikipediaDump)
        throws FileNotFoundException, XMLStreamException {
    // Markup this method looks for:
    // <text xml:space="preserve">#REDIRECT [[Autopoiesis]]</text>
    // <text xml:space="preserve">#REDIRECT:[[Hans Leo Haler]]</text>
    // <text xml:space="preserve">#redirect [[Weier Hai]]</text>
    // #weiterleitung
    // <page>
    // <title>Autopoiesis</title>

    final Logger log = Logger.getLogger(WikipediaDumpParser.class.getName());
    log.info("will collect redirects from wikipedia dump...");

    MultiValueHashMap<String, String> redirectsByTarget = new MultiValueBalancedTreeMap<String, String>();
    XMLEventReader events = XMLInputFactory.newInstance().createXMLEventReader(sWikipediaDump, "Utf-8");

    String currentTitle = "";
    int titleCount = 0;
    while (events.hasNext()) {
        XMLEvent event = events.nextEvent();
        if (!event.isStartElement()) {
            continue;
        }
        String elementName = event.asStartElement().getName().getLocalPart();

        // Remember the most recent title; it is needed if the page turns out to be a redirect.
        if (elementName.equals("title")) {
            currentTitle = readNextCharEventsText(events);
            titleCount++;
            if (titleCount % 200000 == 0) {
                log.info("read doc #" + StringUtils.beautifyNumber(titleCount));
            }
            continue;
        }

        if (!elementName.equals("text")) {
            continue;
        }

        // A text tag is followed by either character data or its end tag (empty texts exist).
        if (!events.peek().isCharacters()) {
            continue;
        }

        String text = readNextCharEventsText(events);
        if (text == null) {
            continue;
        }
        text = text.trim();

        boolean singleLine = !text.contains("\n");
        boolean isRedirect = text.regionMatches(true, 0, "#redirect", 0, 9)
                || (text.regionMatches(true, 0, "redirect", 0, 8) && singleLine)
                || text.regionMatches(true, 0, "#weiterleitung", 0, 14)
                || (text.regionMatches(true, 0, "weiterleitung", 0, 13) && singleLine);
        if (!isRedirect) {
            continue;
        }

        // A redirect was found: record it under its target.
        int open = text.indexOf("[[");
        int close = text.indexOf("]]");
        if (open < 0 || close < 0 || close <= open
                || (open + 2) > text.length() || close > text.length()) {
            continue;
        }

        String redirectTarget = text.substring(open + 2, close).trim();
        redirectsByTarget.add(redirectTarget, currentTitle);
    }

    log.info("Redirects found: " + StringUtils.beautifyNumber(redirectsByTarget.valueSize()));

    return redirectsByTarget;
}

From source file:de.dfki.km.leech.parser.wikipedia.WikipediaDumpParser.java

/**
 * Parses a Wikipedia XML dump and emits one XHTML document to {@code handler} for each
 * {@code page} element that was not skipped.
 *
 * <p>For every page the title, source URL, timestamp and username are written into
 * {@code metadata}, and the page text is rendered to plain text and emitted inside a
 * single {@code p} element. Pages whose title is itself a redirect, or whose title starts
 * with one of the special namespace prefixes (category, template, help, ...), are skipped.
 * When redirect detection is enabled in the {@code WikipediaDumpParserConfig}, the dump
 * is read twice: once to collect redirects and once to emit pages.
 *
 * <p>NOTE(review): every exception raised while parsing is logged at SEVERE level and
 * then swallowed, so callers are not informed of failures despite the {@code throws} clause.
 *
 * @param stream the dump to parse
 * @param handler receiver of the generated XHTML events
 * @param metadata metadata object that is cleared and refilled for every page
 * @param context parse context; may carry a {@code WikipediaDumpParserConfig}
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {

    try {

        // We iterate over the page entries. Each holds title, timestamp,
        // <contributor> => <username> and text. The text still has to be cleaned: it is
        // pre-processed with bliki, which in turn needs the string prepared beforehand and
        // cleaned up afterwards.

        WikipediaDumpParserConfig wikipediaDumpParserConfig = context.get(WikipediaDumpParserConfig.class);

        if (wikipediaDumpParserConfig == null) {
            Logger.getLogger(WikipediaDumpParser.class.getName())
                    .info("No wikipedia parser config found. Will take the default one.");
            wikipediaDumpParserConfig = new WikipediaDumpParserConfig();
        }

        TikaInputStream tikaStream = TikaInputStream.get(stream);

        File fWikipediaDumpFile4Stream = tikaStream.getFile();

        MultiValueHashMap<String, String> hsPageTitle2Redirects = new MultiValueHashMap<String, String>();
        if (wikipediaDumpParserConfig.determinePageRedirects) {
            // Close the file handle used for the redirect pass as soon as that pass is done.
            InputStream redirectPassStream = new FileInputStream(fWikipediaDumpFile4Stream);
            try {
                hsPageTitle2Redirects = getPageTitle2Redirects(redirectPassStream);
            } finally {
                redirectPassStream.close();
            }
        }

        HashSet<String> hsRedirectPageTitles = new HashSet<String>(hsPageTitle2Redirects.values());

        String strCleanedText = "";
        String strBaseURL = null;

        XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
        InputStream dumpStream = new FileInputStream(fWikipediaDumpFile4Stream);
        try {
            XMLEventReader xmlEventReader = xmlInputFactory.createXMLEventReader(dumpStream, "Utf-8");
            while (xmlEventReader.hasNext()) {

                XMLEvent xmlEvent = xmlEventReader.nextEvent();

                if (xmlEvent.isEndElement() && xmlEvent.asEndElement().getName().getLocalPart().equals("page")) {
                    if (metadata.size() == 0)
                        continue;

                    // The mime type should be part of the metadata as well.
                    metadata.add(Metadata.CONTENT_TYPE, "application/wikipedia+xml");

                    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
                    xhtml.startDocument();

                    xhtml.startElement("p");
                    xhtml.characters(strCleanedText.toCharArray(), 0, strCleanedText.length());
                    xhtml.endElement("p");

                    xhtml.endDocument();

                }

                if (!xmlEvent.isStartElement())
                    continue;

                // ##### the siteinfo

                if (strBaseURL == null && xmlEvent.asStartElement().getName().getLocalPart().equals("base")) {
                    // http://de.wikipedia.org/wiki/Wikipedia:Hauptseite =>http://de.wikipedia.org/wiki/
                    strBaseURL = readNextCharEventsText(xmlEventReader);
                    strBaseURL = strBaseURL.substring(0, strBaseURL.lastIndexOf("/") + 1);
                }

                // ##### the page

                if (xmlEvent.asStartElement().getName().getLocalPart().equals("page")) {
                    // Start every page with empty metadata.
                    for (String strKey : metadata.names())
                        metadata.remove(strKey);
                }

                // ##### the title

                if (xmlEvent.asStartElement().getName().getLocalPart().equals("title")) {
                    // Always remember the current title.
                    String strCurrentTitle = readNextCharEventsText(xmlEventReader);

                    // If the title belongs to a redirect page, consume all events up to the
                    // page's end tag so that the page is ignored. Special Wikipedia pages are
                    // ignored the same way.
                    String strSmallTitle = strCurrentTitle.trim().toLowerCase();
                    if (hsRedirectPageTitles.contains(strCurrentTitle)
                            || hsRedirectPageTitles.contains(strSmallTitle)
                            || hsRedirectPageTitles.contains(strCurrentTitle.trim())
                            || strSmallTitle.startsWith("category:") || strSmallTitle.startsWith("kategorie:")
                            || strSmallTitle.startsWith("vorlage:") || strSmallTitle.startsWith("template:")
                            || strSmallTitle.startsWith("hilfe:") || strSmallTitle.startsWith("help:")
                            || strSmallTitle.startsWith("wikipedia:") || strSmallTitle.startsWith("portal:")
                            || strSmallTitle.startsWith("mediawiki:")) {

                        while (true) {
                            XMLEvent nextXmlEvent = xmlEventReader.nextEvent();
                            if (nextXmlEvent.isEndElement()
                                    && nextXmlEvent.asEndElement().getName().getLocalPart().equals("page"))
                                break;
                        }
                    } else {
                        metadata.add(Metadata.TITLE, strCurrentTitle);
                        metadata.add(Metadata.SOURCE, strBaseURL + strCurrentTitle);

                        for (String strRedirect : hsPageTitle2Redirects.get(strCurrentTitle)) {
                            // Ignore titles that differ only in upper/lower case.
                            if (!StringUtils.containsIgnoreCase(strRedirect, metadata.getValues(Metadata.TITLE)))
                                metadata.add(Metadata.TITLE, strRedirect);
                        }
                    }

                    continue;
                }

                // ##### the text
                if (xmlEvent.asStartElement().getName().getLocalPart().equals("text")) {
                    String strText = readNextCharEventsText(xmlEventReader);

                    if (wikipediaDumpParserConfig.parseLinksAndCategories)
                        parseLinksAndCategories(strText, strBaseURL, metadata, handler);
                    if (wikipediaDumpParserConfig.parseInfoBoxes)
                        parseInfoBox(strText, metadata, handler);
                    if (wikipediaDumpParserConfig.parseGeoCoordinates)
                        parseGeoCoordinates(strText, metadata);

                    // Because of shortcomings in the cleaner used here, the text needs some
                    // preparation before and cleanup after rendering.
                    strText = strText.replaceAll("==\n", "==\n\n");
                    strText = strText.replaceAll("\n==", "\n\n==");

                    strCleanedText = m_wikiModel.render(new PlainTextConverter(), strText);

                    strCleanedText = strCleanedText.replaceAll("\\{\\{", " ");
                    strCleanedText = strCleanedText.replaceAll("\\}\\}", " ");

                    strCleanedText = StringEscapeUtils.unescapeHtml4(strCleanedText);

                    continue;
                }

                // ##### the timestamp
                if (xmlEvent.asStartElement().getName().getLocalPart().equals("timestamp")) {
                    String strTimestamp = readNextCharEventsText(xmlEventReader);

                    metadata.add(Metadata.MODIFIED, strTimestamp);

                    continue;
                }

                // ##### the username
                if (xmlEvent.asStartElement().getName().getLocalPart().equals("username")) {
                    String strUsername = readNextCharEventsText(xmlEventReader);

                    metadata.add(Metadata.CREATOR, strUsername);

                    continue;
                }

            }
        } finally {
            // Release the file handle of the main pass even when parsing fails midway.
            dumpStream.close();
        }

    } catch (Exception e) {
        Logger.getLogger(WikipediaDumpParser.class.getName()).log(Level.SEVERE, "Error", e);
    }

}

From source file:ca.phon.session.io.xml.v12.XMLSessionReader_v12.java

/**
 * Checks whether {@code file} can be read by this reader by inspecting only its first
 * element: it must be named {@code session} and carry the attribute
 * {@code version="PB1.2"}.
 *
 * @param file the file to inspect
 * @return {@code true} if the first element is a {@code session} element with version
 *         {@code PB1.2}; {@code false} otherwise, including when the version attribute is absent
 * @throws IOException if the file cannot be read, or wrapping the {@link XMLStreamException}
 *         raised when its content cannot be parsed
 */
@Override
public boolean canRead(File file) throws IOException {
    boolean canRead = false;

    // StAX lets us stop after the first element instead of parsing the whole file.
    XMLInputFactory factory = XMLInputFactory.newInstance();
    try (FileInputStream source = new FileInputStream(file)) {
        XMLEventReader xmlReader = factory.createXMLEventReader(source, "UTF-8");
        XMLEventReader reader = factory.createFilteredReader(xmlReader, new XMLWhitespaceFilter());

        // Skip everything that precedes the first start element.
        XMLEvent evt;
        while (!(evt = reader.nextEvent()).isStartElement()) {
            // intentionally empty
        }

        if (evt.asStartElement().getName().getLocalPart().equals("session")) {
            // getAttributeByName returns null for a missing attribute; treat that as
            // "not readable" rather than failing with a NullPointerException.
            javax.xml.stream.events.Attribute versionAttr =
                    evt.asStartElement().getAttributeByName(new QName("version"));
            canRead = versionAttr != null && versionAttr.getValue().equals("PB1.2");
        }
    } catch (XMLStreamException e) {
        throw new IOException(e);
    }

    return canRead;
}