Example usage for javax.xml.stream.XMLEventReader.nextEvent()

Introduction

This page collects example usages of the javax.xml.stream.XMLEventReader.nextEvent() method.

Prototype

public XMLEvent nextEvent() throws XMLStreamException;

Source Link

Document

Gets the next XMLEvent.

Usage

From source file:de.dfki.km.leech.parser.wikipedia.WikipediaDumpParser.java

/**
 * Scans a Wikipedia XML dump and collects the redirects it contains.
 * <p>
 * The text of every {@code <text>} element is inspected; when it starts with a redirect keyword
 * ({@code #redirect}, {@code redirect}, {@code #weiterleitung} or {@code weiterleitung}, case-insensitive) the
 * link target between {@code [[} and {@code ]]} is extracted and the title of the current page is stored under
 * that target.
 *
 * @param sWikipediaDump the dump to read; the stream itself is not closed by this method
 * @return a multi-value map from redirect target to the titles of the pages that redirect to it
 * @throws FileNotFoundException declared for callers; not thrown by the code visible in this method
 * @throws XMLStreamException if the dump cannot be parsed
 */
public MultiValueHashMap<String, String> getPageTitle2Redirects(InputStream sWikipediaDump)
        throws FileNotFoundException, XMLStreamException {
    // Examples of the markup that is recognised as a redirect:
    // <text xml:space="preserve">#REDIRECT [[Autopoiesis]]</text>
    // <text xml:space="preserve">#REDIRECT:[[Hans Leo Haler]]</text>
    // <text xml:space="preserve">#redirect [[Weier Hai]]</text>
    // #weiterleitung
    // <page>
    // <title>Autopoiesis</title>

    Logger.getLogger(WikipediaDumpParser.class.getName()).info("will collect redirects from wikipedia dump...");

    MultiValueHashMap<String, String> hsPageTitle2Redirects = new MultiValueBalancedTreeMap<String, String>();

    String strCurrentTitle = "";
    XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();

    XMLEventReader xmlEventReader = xmlInputFactory.createXMLEventReader(sWikipediaDump, "Utf-8");
    try {
        int iTitlesRead = 0;
        while (xmlEventReader.hasNext()) {
            XMLEvent xmlEvent = xmlEventReader.nextEvent();

            if (!xmlEvent.isStartElement())
                continue;

            String strElementName = xmlEvent.asStartElement().getName().getLocalPart();

            // Remember the most recent title; it is needed if the page turns out to be a redirect.
            if (strElementName.equals("title")) {
                strCurrentTitle = readNextCharEventsText(xmlEventReader);

                iTitlesRead++;
                if (iTitlesRead % 200000 == 0)
                    Logger.getLogger(WikipediaDumpParser.class.getName())
                            .info("read doc #" + StringUtils.beautifyNumber(iTitlesRead));

                continue;
            }

            if (!strElementName.equals("text"))
                continue;

            // We are at a text tag. Either a characters event or an end event follows (empty texts exist
            // as well). peek() returns null when the stream has no further events.
            XMLEvent nextEvent = xmlEventReader.peek();

            if (nextEvent == null || !nextEvent.isCharacters())
                continue;

            String strCharEventData = readNextCharEventsText(xmlEventReader);
            if (strCharEventData == null)
                continue;

            strCharEventData = strCharEventData.trim();

            // The variants without a leading '#' only count when the whole text is a single line.
            boolean bSingleLine = !strCharEventData.contains("\n");

            boolean bRedirect = strCharEventData.regionMatches(true, 0, "#redirect", 0, "#redirect".length())
                    || (bSingleLine
                            && strCharEventData.regionMatches(true, 0, "redirect", 0, "redirect".length()))
                    || strCharEventData.regionMatches(true, 0, "#weiterleitung", 0, "#weiterleitung".length())
                    || (bSingleLine && strCharEventData.regionMatches(true, 0, "weiterleitung", 0,
                            "weiterleitung".length()));

            if (!bRedirect)
                continue;

            // We have a redirect - extract the link target and record it in the result map.
            int iStart = strCharEventData.indexOf("[[");
            int iEnd = strCharEventData.indexOf("]]");
            if (iStart < 0 || iEnd < 0)
                continue;
            // "]]" located before "[[" is not a well-formed link.
            if (iEnd <= iStart)
                continue;

            String strRedirectTarget = strCharEventData.substring(iStart + 2, iEnd).trim();
            hsPageTitle2Redirects.add(strRedirectTarget, strCurrentTitle);
        }
    } finally {
        // Release the parser's resources even when parsing fails; this does not close sWikipediaDump.
        xmlEventReader.close();
    }

    Logger.getLogger(WikipediaDumpParser.class.getName())
            .info("Redirects found: " + StringUtils.beautifyNumber(hsPageTitle2Redirects.valueSize()));

    return hsPageTitle2Redirects;

}

From source file:json_to_xml_1.java

/**
 * Runs one JSON-to-XML conversion job.
 * <p>
 * {@code args[0]} is the path of an XML job file and {@code args[1]} the path of the result info file. The job
 * file is scanned for a {@code json-input-file} and an {@code xml-output-file} element, each carrying a
 * {@code path} attribute (relative paths are resolved against the job file's directory). The JSON input is
 * then converted with {@code XML.toString} and written to the output file as UTF-8.
 *
 * @param args the job file path followed by the result info file path
 * @return 0 on success
 * @throws ProgramTerminationException if arguments are missing, a file fails its existence/type/permission
 *         check, the job file cannot be read, or the conversion or the writing of the output fails
 */
public int execute(String args[]) throws ProgramTerminationException {
    this.getInfoMessages().clear();

    if (args.length < 2) {
        throw constructTermination("messageArgumentsMissing", null,
                getI10nString("messageArgumentsMissingUsage") + "\n\tjson_to_xml_1 "
                        + getI10nString("messageParameterList") + "\n");
    }

    File resultInfoFile = new File(args[1]);

    try {
        resultInfoFile = resultInfoFile.getCanonicalFile();
    } catch (SecurityException ex) {
        throw constructTermination("messageResultInfoFileCantGetCanonicalPath", ex, null,
                resultInfoFile.getAbsolutePath());
    } catch (IOException ex) {
        throw constructTermination("messageResultInfoFileCantGetCanonicalPath", ex, null,
                resultInfoFile.getAbsolutePath());
    }

    // The result info file may not exist yet; if it does, it must be a writable regular file.
    if (resultInfoFile.exists()) {
        if (resultInfoFile.isFile()) {
            if (!resultInfoFile.canWrite()) {
                throw constructTermination("messageResultInfoFileIsntWritable", null, null,
                        resultInfoFile.getAbsolutePath());
            }
        } else {
            throw constructTermination("messageResultInfoPathIsntAFile", null, null,
                    resultInfoFile.getAbsolutePath());
        }
    }

    json_to_xml_1.resultInfoFile = resultInfoFile;

    File jobFile = new File(args[0]);

    try {
        jobFile = jobFile.getCanonicalFile();
    } catch (SecurityException ex) {
        throw constructTermination("messageJobFileCantGetCanonicalPath", ex, null, jobFile.getAbsolutePath());
    } catch (IOException ex) {
        throw constructTermination("messageJobFileCantGetCanonicalPath", ex, null, jobFile.getAbsolutePath());
    }

    if (!jobFile.exists()) {
        throw constructTermination("messageJobFileDoesntExist", null, null, jobFile.getAbsolutePath());
    }

    if (!jobFile.isFile()) {
        throw constructTermination("messageJobPathIsntAFile", null, null, jobFile.getAbsolutePath());
    }

    if (!jobFile.canRead()) {
        throw constructTermination("messageJobFileIsntReadable", null, null, jobFile.getAbsolutePath());
    }

    System.out.println("json_to_xml_1: " + getI10nStringFormatted("messageCallDetails",
            jobFile.getAbsolutePath(), resultInfoFile.getAbsolutePath()));

    File inputFile = null;
    File outputFile = null;

    // Declared outside the try so the finally block can release them on every exit path.
    InputStream in = null;
    XMLEventReader eventReader = null;

    try {
        XMLInputFactory inputFactory = XMLInputFactory.newInstance();
        in = new FileInputStream(jobFile);
        eventReader = inputFactory.createXMLEventReader(in);

        while (eventReader.hasNext()) {
            XMLEvent event = eventReader.nextEvent();

            if (event.isStartElement()) {
                String tagName = event.asStartElement().getName().getLocalPart();

                if (tagName.equals("json-input-file")) {
                    StartElement inputFileElement = event.asStartElement();
                    Attribute pathAttribute = inputFileElement.getAttributeByName(new QName("path"));

                    if (pathAttribute == null) {
                        throw constructTermination("messageJobFileEntryIsMissingAnAttribute", null, null,
                                jobFile.getAbsolutePath(), tagName, "path");
                    }

                    String inputFilePath = pathAttribute.getValue();

                    if (inputFilePath.isEmpty()) {
                        throw constructTermination("messageJobFileAttributeValueIsEmpty", null, null,
                                jobFile.getAbsolutePath(), tagName, "path");
                    }

                    inputFile = new File(inputFilePath);

                    // Relative paths are interpreted relative to the directory of the job file.
                    if (!inputFile.isAbsolute()) {
                        inputFile = new File(
                                jobFile.getAbsoluteFile().getParent() + File.separator + inputFilePath);
                    }

                    try {
                        inputFile = inputFile.getCanonicalFile();
                    } catch (SecurityException ex) {
                        throw constructTermination("messageInputFileCantGetCanonicalPath", ex, null,
                                new File(inputFilePath).getAbsolutePath(), jobFile.getAbsolutePath());
                    } catch (IOException ex) {
                        throw constructTermination("messageInputFileCantGetCanonicalPath", ex, null,
                                new File(inputFilePath).getAbsolutePath(), jobFile.getAbsolutePath());
                    }

                    if (!inputFile.exists()) {
                        throw constructTermination("messageInputFileDoesntExist", null, null,
                                inputFile.getAbsolutePath(), jobFile.getAbsolutePath());
                    }

                    if (!inputFile.isFile()) {
                        throw constructTermination("messageInputPathIsntAFile", null, null,
                                inputFile.getAbsolutePath(), jobFile.getAbsolutePath());
                    }

                    if (!inputFile.canRead()) {
                        throw constructTermination("messageInputFileIsntReadable", null, null,
                                inputFile.getAbsolutePath(), jobFile.getAbsolutePath());
                    }
                } else if (tagName.equals("xml-output-file")) {
                    StartElement outputFileElement = event.asStartElement();
                    Attribute pathAttribute = outputFileElement.getAttributeByName(new QName("path"));

                    if (pathAttribute == null) {
                        throw constructTermination("messageJobFileEntryIsMissingAnAttribute", null, null,
                                jobFile.getAbsolutePath(), tagName, "path");
                    }

                    String outputFilePath = pathAttribute.getValue();

                    if (outputFilePath.isEmpty()) {
                        throw constructTermination("messageJobFileAttributeValueIsEmpty", null, null,
                                jobFile.getAbsolutePath(), tagName, "path");
                    }

                    outputFile = new File(outputFilePath);

                    // Relative paths are interpreted relative to the directory of the job file.
                    if (!outputFile.isAbsolute()) {
                        outputFile = new File(
                                jobFile.getAbsoluteFile().getParent() + File.separator + outputFilePath);
                    }

                    try {
                        outputFile = outputFile.getCanonicalFile();
                    } catch (SecurityException ex) {
                        throw constructTermination("messageOutputFileCantGetCanonicalPath", ex, null,
                                new File(outputFilePath).getAbsolutePath(), jobFile.getAbsolutePath());
                    } catch (IOException ex) {
                        throw constructTermination("messageOutputFileCantGetCanonicalPath", ex, null,
                                new File(outputFilePath).getAbsolutePath(), jobFile.getAbsolutePath());
                    }

                    // The output file may not exist yet; if it does, it must be a writable regular file.
                    if (outputFile.exists()) {
                        if (outputFile.isFile()) {
                            if (!outputFile.canWrite()) {
                                throw constructTermination("messageOutputFileIsntWritable", null, null,
                                        outputFile.getAbsolutePath());
                            }
                        } else {
                            throw constructTermination("messageOutputPathIsntAFile", null, null,
                                    outputFile.getAbsolutePath());
                        }
                    }
                }
            }
        }
    } catch (XMLStreamException ex) {
        throw constructTermination("messageJobFileErrorWhileReading", ex, null, jobFile.getAbsolutePath());
    } catch (SecurityException ex) {
        throw constructTermination("messageJobFileErrorWhileReading", ex, null, jobFile.getAbsolutePath());
    } catch (IOException ex) {
        throw constructTermination("messageJobFileErrorWhileReading", ex, null, jobFile.getAbsolutePath());
    } finally {
        // Best-effort cleanup: the job file was only read, so a failing close cannot lose data and must
        // not mask the outcome of the parsing above.
        if (eventReader != null) {
            try {
                eventReader.close();
            } catch (XMLStreamException ignored) {
                // intentionally ignored, see above
            }
        }

        if (in != null) {
            try {
                in.close();
            } catch (IOException ignored) {
                // intentionally ignored, see above
            }
        }
    }

    if (inputFile == null) {
        throw constructTermination("messageJobFileNoInputFile", null, null, jobFile.getAbsolutePath());
    }

    if (outputFile == null) {
        throw constructTermination("messageJobFileNoOutputFile", null, null, jobFile.getAbsolutePath());
    }

    StringBuilder stringBuilder = new StringBuilder();
    BufferedReader jsonReader = null;

    try {
        // NOTE(review): FileReader decodes with the platform default charset; JSON input is usually
        // UTF-8 - confirm whether an explicit charset should be used here.
        jsonReader = new BufferedReader(new FileReader(inputFile));
        JSONObject json = new JSONObject(new JSONTokener(jsonReader));

        stringBuilder.append(XML.toString(json));
    } catch (Exception ex) {
        throw constructTermination("messageConversionError", ex, null, inputFile.getAbsolutePath());
    } finally {
        if (jsonReader != null) {
            try {
                jsonReader.close();
            } catch (IOException ignored) {
                // read-only stream; a failing close cannot affect the converted result
            }
        }
    }

    BufferedWriter writer = null;

    try {
        writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFile), "UTF-8"));

        writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
        writer.write(
                "<!-- This file was created by json_to_xml_1, which is free software licensed under the GNU Affero General Public License 3 or any later version (see https://github.com/publishing-systems/digital_publishing_workflow_tools/ and http://www.publishing-systems.org). -->\n");
        writer.write(stringBuilder.toString());

        writer.flush();
        // Close on the success path inside the try so that a failing close is still reported.
        writer.close();
        writer = null;
    } catch (FileNotFoundException ex) {
        throw constructTermination("messageOutputFileWritingError", ex, null, outputFile.getAbsolutePath());
    } catch (UnsupportedEncodingException ex) {
        throw constructTermination("messageOutputFileWritingError", ex, null, outputFile.getAbsolutePath());
    } catch (IOException ex) {
        throw constructTermination("messageOutputFileWritingError", ex, null, outputFile.getAbsolutePath());
    } finally {
        // Only reached with a non-null writer when writing failed; the failure is already being reported.
        if (writer != null) {
            try {
                writer.close();
            } catch (IOException ignored) {
                // intentionally ignored, the original write error takes precedence
            }
        }
    }

    return 0;
}

From source file:sapience.injectors.stax.inject.StringBasedStaxStreamInjector.java

/**
 * If the reference is more then a simple attribute, we have to add new XML (subtree) to the stream. We transform
 * the reference into an InputStream and invoke another SAX parsing process for it. But the parsed events are added
 * to the main XMLEventWriter. /*w  w w . ja v  a 2 s. c  o m*/
 *
 * @param w
 * @param string
 * @throws XMLStreamException 
 * @throws XMLStreamException
 */
private void createEventsForElement(XMLEventWriter w, Reference ref) throws XMLStreamException {
    XMLEventReader r = null;
    try {
        StringBuilder target = new StringBuilder(ref.getTarget().toString());

        NamespaceContext c = w.getNamespaceContext();

        // process namespaces
        //processNamespace(target, w.getNamespaceContext());

        ByteArrayInputStream bais = new ByteArrayInputStream(target.toString().getBytes());
        this.inFac.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, false);
        r = this.inFac.createXMLEventReader(bais);
        // start a new line

        while (r.hasNext()) {
            XMLEvent e = r.nextEvent();
            switch (e.getEventType()) {
            case XMLEvent.START_DOCUMENT:
                break;
            case XMLEvent.END_DOCUMENT:
                break;
            default:
                w.add(e);
                break;
            }
        }
    } finally {
        ;

        if (r != null)
            r.close();
    }

}

From source file:de.dfki.km.leech.parser.wikipedia.WikipediaDumpParser.java

/**
 * Parses a Wikipedia XML dump and emits one XHTML document per page to the given handler.
 * <p>
 * The dump is iterated page by page. For each page the title, timestamp, contributor username and text are
 * read; the text is cleaned with the wiki model's plain text converter, with some pre- and post-processing
 * around it. Redirect pages and pages from special namespaces (category, template, help, ...) are skipped.
 * Any exception raised during parsing is logged and not propagated.
 *
 * @param stream the dump; it is spooled to a file through {@code TikaInputStream}
 * @param handler receives the XHTML events of every emitted page
 * @param metadata reused for every page: cleared at each page start and filled with the page's values
 * @param context may carry a {@code WikipediaDumpParserConfig}; a default one is used otherwise
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {

    // Declared outside the try so the finally block can release them on every exit path.
    InputStream dumpStream = null;
    XMLEventReader xmlEventReader = null;

    try {

        WikipediaDumpParserConfig wikipediaDumpParserConfig = context.get(WikipediaDumpParserConfig.class);

        if (wikipediaDumpParserConfig == null) {
            Logger.getLogger(WikipediaDumpParser.class.getName())
                    .info("No wikipedia parser config found. Will take the default one.");
            wikipediaDumpParserConfig = new WikipediaDumpParserConfig();
        }

        TikaInputStream tikaStream = TikaInputStream.get(stream);

        File fWikipediaDumpFile4Stream = tikaStream.getFile();

        MultiValueHashMap<String, String> hsPageTitle2Redirects = new MultiValueHashMap<String, String>();
        if (wikipediaDumpParserConfig.determinePageRedirects) {
            // First pass over the dump file: collect the redirects, then release the file handle.
            InputStream redirectStream = new FileInputStream(fWikipediaDumpFile4Stream);
            try {
                hsPageTitle2Redirects = getPageTitle2Redirects(redirectStream);
            } finally {
                redirectStream.close();
            }
        }

        HashSet<String> hsRedirectPageTitles = new HashSet<String>(hsPageTitle2Redirects.values());

        String strCleanedText = "";
        String strBaseURL = null;

        XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
        dumpStream = new FileInputStream(fWikipediaDumpFile4Stream);
        xmlEventReader = xmlInputFactory.createXMLEventReader(dumpStream, "Utf-8");
        while (xmlEventReader.hasNext()) {

            XMLEvent xmlEvent = xmlEventReader.nextEvent();

            if (xmlEvent.isEndElement() && xmlEvent.asEndElement().getName().getLocalPart().equals("page")) {
                if (metadata.size() == 0)
                    continue;

                // the mime type should be part of the metadata as well
                metadata.add(Metadata.CONTENT_TYPE, "application/wikipedia+xml");

                XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
                xhtml.startDocument();

                xhtml.startElement("p");
                xhtml.characters(strCleanedText.toCharArray(), 0, strCleanedText.length());
                xhtml.endElement("p");

                xhtml.endDocument();

            }

            if (!xmlEvent.isStartElement())
                continue;

            String strElementName = xmlEvent.asStartElement().getName().getLocalPart();

            // ##### the siteinfo

            if (strBaseURL == null && strElementName.equals("base")) {
                // http://de.wikipedia.org/wiki/Wikipedia:Hauptseite =>http://de.wikipedia.org/wiki/
                String strBase = readNextCharEventsText(xmlEventReader);
                if (strBase != null)
                    strBaseURL = strBase.substring(0, strBase.lastIndexOf("/") + 1);
            }

            // ##### the page

            if (strElementName.equals("page")) {
                for (String strKey : metadata.names())
                    metadata.remove(strKey);

                // Do not carry the text of the previous page over to a page without a text element.
                strCleanedText = "";
            }

            // ##### the title

            if (strElementName.equals("title")) {
                // always remember the current title
                String strCurrentTitle = readNextCharEventsText(xmlEventReader);

                // If the title belongs to a redirect page, consume all events up to the page end so that the
                // page is ignored. Special wikipedia pages are ignored the same way.
                String strSmallTitle = strCurrentTitle.trim().toLowerCase();
                if (hsRedirectPageTitles.contains(strCurrentTitle)
                        || hsRedirectPageTitles.contains(strSmallTitle)
                        || hsRedirectPageTitles.contains(strCurrentTitle.trim())
                        || strSmallTitle.startsWith("category:") || strSmallTitle.startsWith("kategorie:")
                        || strSmallTitle.startsWith("vorlage:") || strSmallTitle.startsWith("template:")
                        || strSmallTitle.startsWith("hilfe:") || strSmallTitle.startsWith("help:")
                        || strSmallTitle.startsWith("wikipedia:") || strSmallTitle.startsWith("portal:")
                        || strSmallTitle.startsWith("mediawiki:")) {

                    while (true) {
                        XMLEvent nextXmlEvent = xmlEventReader.nextEvent();
                        if (nextXmlEvent.isEndElement()
                                && nextXmlEvent.asEndElement().getName().getLocalPart().equals("page"))
                            break;
                    }
                } else {
                    metadata.add(Metadata.TITLE, strCurrentTitle);
                    metadata.add(Metadata.SOURCE, strBaseURL + strCurrentTitle);

                    for (String strRedirect : hsPageTitle2Redirects.get(strCurrentTitle)) {
                        // ignore titles that differ only in upper/lower case
                        if (!StringUtils.containsIgnoreCase(strRedirect, metadata.getValues(Metadata.TITLE)))
                            metadata.add(Metadata.TITLE, strRedirect);
                    }
                }

                continue;
            }

            // ##### the text
            if (strElementName.equals("text")) {
                String strText = readNextCharEventsText(xmlEventReader);
                // NOTE(review): getPageTitle2Redirects treats a null result of readNextCharEventsText as
                // possible; an empty text must not abort the whole dump here.
                if (strText == null)
                    strText = "";

                if (wikipediaDumpParserConfig.parseLinksAndCategories)
                    parseLinksAndCategories(strText, strBaseURL, metadata, handler);
                if (wikipediaDumpParserConfig.parseInfoBoxes)
                    parseInfoBox(strText, metadata, handler);
                if (wikipediaDumpParserConfig.parseGeoCoordinates)
                    parseGeoCoordinates(strText, metadata);

                // because of some deficits of the cleaner in use, the text needs pre- and post-processing
                strText = strText.replaceAll("==\n", "==\n\n");
                strText = strText.replaceAll("\n==", "\n\n==");

                strCleanedText = m_wikiModel.render(new PlainTextConverter(), strText);

                strCleanedText = strCleanedText.replaceAll("\\{\\{", " ");
                strCleanedText = strCleanedText.replaceAll("\\}\\}", " ");

                strCleanedText = StringEscapeUtils.unescapeHtml4(strCleanedText);

                continue;
            }

            // ##### the timestamp
            if (strElementName.equals("timestamp")) {
                String strTimestamp = readNextCharEventsText(xmlEventReader);

                metadata.add(Metadata.MODIFIED, strTimestamp);

                continue;
            }

            // ##### the username
            if (strElementName.equals("username")) {
                String strUsername = readNextCharEventsText(xmlEventReader);

                metadata.add(Metadata.CREATOR, strUsername);

                continue;
            }

        }

    } catch (Exception e) {
        Logger.getLogger(WikipediaDumpParser.class.getName()).log(Level.SEVERE, "Error", e);
    } finally {
        // Release the parser and the file handle of the second pass; failures here are only logged.
        if (xmlEventReader != null) {
            try {
                xmlEventReader.close();
            } catch (XMLStreamException e) {
                Logger.getLogger(WikipediaDumpParser.class.getName()).log(Level.WARNING,
                        "Could not close the XML event reader", e);
            }
        }
        if (dumpStream != null) {
            try {
                dumpStream.close();
            } catch (IOException e) {
                Logger.getLogger(WikipediaDumpParser.class.getName()).log(Level.WARNING,
                        "Could not close the wikipedia dump stream", e);
            }
        }
    }

}

From source file:ca.phon.session.io.xml.v12.XMLSessionReader_v12.java

/**
 * Checks whether the given file looks like a session file this reader understands.
 * <p>
 * Only the first start element is inspected (via StAX, so the rest of the file is not read): it must be named
 * {@code session} and carry a {@code version} attribute with the value {@code PB1.2}.
 *
 * @param file the file to probe
 * @return {@code true} if the first element is a {@code session} element with version {@code PB1.2};
 *         {@code false} otherwise, including when the {@code version} attribute is missing
 * @throws IOException if the file cannot be opened or is not parseable XML
 */
@Override
public boolean canRead(File file) throws IOException {
    boolean canRead = false;

    XMLInputFactory factory = XMLInputFactory.newInstance();
    try (FileInputStream source = new FileInputStream(file)) {
        XMLEventReader xmlReader = factory.createXMLEventReader(source, "UTF-8");
        XMLEventReader reader = factory.createFilteredReader(xmlReader, new XMLWhitespaceFilter());
        try {
            // advance to the first start element
            XMLEvent evt;
            while (!(evt = reader.nextEvent()).isStartElement())
                ;

            // getAttributeByName returns null when the attribute is absent; such a file is simply not
            // readable by this reader and must not cause a NullPointerException.
            javax.xml.stream.events.Attribute versionAttr = evt.asStartElement()
                    .getAttributeByName(new QName("version"));
            canRead = evt.asStartElement().getName().getLocalPart().equals("session")
                    && versionAttr != null
                    && "PB1.2".equals(versionAttr.getValue());
        } finally {
            try {
                reader.close();
            } catch (XMLStreamException ignored) {
                // the probe result is already determined; the underlying stream is closed by the
                // enclosing try-with-resources
            }
        }
    } catch (XMLStreamException e) {
        throw new IOException(e);
    }

    return canRead;
}

From source file:com.msopentech.odatajclient.testservice.utils.XMLUtilities.java

/**
 * Counts the start elements with the given local name in an XML stream.
 *
 * @param is the XML input; an event reader for it is obtained from {@code getEventReader}
 * @param elementName the local name to count (namespaces are not considered)
 * @return the number of start elements whose local name equals {@code elementName}
 * @throws XMLStreamException if the input cannot be parsed
 */
private int countFeedElements(final InputStream is, final String elementName) throws XMLStreamException {
    final XMLEventReader reader = getEventReader(is);

    int count = 0;

    try {
        while (reader.hasNext()) {
            final XMLEvent event = reader.nextEvent();

            if (event.getEventType() == XMLStreamConstants.START_ELEMENT
                    && elementName.equals(event.asStartElement().getName().getLocalPart())) {
                count++;
            }
        }
    } finally {
        // release the reader even when parsing fails part-way through
        reader.close();
    }

    return count;
}

From source file:edu.jhu.hlt.concrete.ingesters.webposts.WebPostIngester.java

/**
 * Ingests a web post file into a {@code Communication}.
 * <p>
 * The complete file content becomes the communication text. The file is then parsed as XML: a headline section
 * is produced by {@code handleBeginning}, and afterwards every non-whitespace characters event becomes a
 * section of kind {@code poster}, {@code postdate} or {@code post} (by its position inside the current post),
 * with a text span computed from the event's character offset. {@code QUOTE} elements with a
 * {@code PREVIOUSPOST} attribute additionally produce a {@code quote} section.
 *
 * @param path the file to ingest
 * @return the populated communication
 * @throws IngestException if the path does not exist or is not a file, or the file cannot be read or parsed
 */
@Override
public Communication fromCharacterBasedFile(final Path path) throws IngestException {
    if (!Files.exists(path))
        throw new IngestException("No file at: " + path.toString());

    AnalyticUUIDGeneratorFactory f = new AnalyticUUIDGeneratorFactory();
    AnalyticUUIDGenerator g = f.create();
    Communication c = new Communication();
    c.setUuid(g.next());
    c.setType(this.getKind());
    c.setMetadata(TooledMetadataConverter.convert(this));

    try {
        ExistingNonDirectoryFile ef = new ExistingNonDirectoryFile(path);
        // the id is the file name up to its first dot
        c.setId(ef.getName().split("\\.")[0]);
    } catch (NoSuchFileException | NotFileException e) {
        // might throw if path is a directory.
        throw new IngestException(path.toString() + " is not a file, or is a directory.");
    }

    // First pass: read the whole file as the communication text.
    String content;
    try (InputStream is = Files.newInputStream(path);
            BufferedInputStream bin = new BufferedInputStream(is, 1024 * 8 * 8);) {
        content = IOUtils.toString(bin, StandardCharsets.UTF_8);
        c.setText(content);
    } catch (IOException e) {
        throw new IngestException(e);
    }

    // Second pass: parse the same file as XML to derive the sections and their text spans.
    try (InputStream is = Files.newInputStream(path);
            BufferedInputStream bin = new BufferedInputStream(is, 1024 * 8 * 8);
            BufferedReader reader = new BufferedReader(new InputStreamReader(bin, StandardCharsets.UTF_8));) {
        XMLEventReader rdr = null;
        try {
            rdr = inF.createXMLEventReader(reader);

            // Below method moves the reader
            // to the headline end element.
            Section headline = this.handleBeginning(rdr, content, c);
            headline.setUuid(g.next());
            c.addToSectionList(headline);
            TextSpan sts = headline.getTextSpan();
            LOGGER.debug("headline text: {}", c.getText().substring(sts.getStart(), sts.getEnding()));

            int sectNumber = 1;
            int subSect = 0;

            int currOff = -1;
            // Big amounts of characters.
            while (rdr.hasNext()) {
                XMLEvent nextEvent = rdr.nextEvent();
                currOff = nextEvent.getLocation().getCharacterOffset();

                // First: see if document is going to end.
                // If yes: exit.
                if (nextEvent.isEndDocument())
                    break;

                // region
                // enables ingestion of quotes inside a usenet webpost.
                // by Tongfei Chen
                if (nextEvent.isStartElement()
                        && nextEvent.asStartElement().getName().equals(QName.valueOf("QUOTE"))) {
                    Attribute attrQuote = nextEvent.asStartElement()
                            .getAttributeByName(QName.valueOf("PREVIOUSPOST"));
                    // getAttributeByName returns null when the attribute is absent; without this guard a
                    // QUOTE element lacking PREVIOUSPOST would fail with a NullPointerException.
                    if (attrQuote != null) {
                        String quote = StringEscapeUtils.escapeXml(attrQuote.getValue());
                        int location = attrQuote.getLocation().getCharacterOffset()
                                + "<QUOTE PREVIOUSPOST=\"".length();
                        Section quoteSection = new Section(g.next(), "quote")
                                .setTextSpan(new TextSpan(location, location + quote.length()));
                        c.addToSectionList(quoteSection);
                    } else {
                        LOGGER.debug("QUOTE element without PREVIOUSPOST attribute at offset {}; no quote section created.",
                                currOff);
                    }
                }
                // endregion

                // Check if this is character data.
                if (nextEvent.isCharacters()) {
                    Characters chars = nextEvent.asCharacters();
                    if (!chars.isWhiteSpace()) {
                        String fpContent = chars.getData();
                        LOGGER.debug("Character offset: {}", currOff);
                        LOGGER.debug("Character based data: {}", fpContent);

                        SimpleImmutableEntry<Integer, Integer> pads = trimSpacing(fpContent);
                        final int tsb = currOff + pads.getKey();

                        final int tse = currOff + fpContent.replace("\"", "&quot;").replace("<", "&lt;")
                                .replace(">", "&gt;").length() - (pads.getValue());
                        // MAINTAIN CORRECT TEXT SPAN
                        // CANNOT USE StringEscapeUtils.escapeXml because it will escape "'", which
                        // is not escaped in the data
                        // @tongfei

                        LOGGER.debug("Section text: {}", content.substring(tsb, tse));
                        TextSpan ts = new TextSpan(tsb, tse);
                        // The first characters event of a post is the poster, the second the post date,
                        // everything after that is post text.
                        String sk;
                        if (subSect == 0)
                            sk = "poster";
                        else if (subSect == 1)
                            sk = "postdate";
                        else
                            sk = "post";

                        Section s = new Section();
                        s.setKind(sk);
                        s.setTextSpan(ts);
                        s.setUuid(g.next());
                        List<Integer> intList = new ArrayList<>();
                        intList.add(sectNumber);
                        intList.add(subSect);
                        s.setNumberList(intList);
                        c.addToSectionList(s);

                        subSect++;
                    }
                } else if (nextEvent.isEndElement()) {
                    EndElement ee = nextEvent.asEndElement();
                    currOff = ee.getLocation().getCharacterOffset();
                    QName name = ee.getName();
                    String localName = name.getLocalPart();
                    LOGGER.debug("Hit end element: {}", localName);
                    if (localName.equalsIgnoreCase(POST_LOCAL_NAME)) {
                        LOGGER.debug("Switching to new post.");
                        sectNumber++;
                        subSect = 0;
                    } else if (localName.equalsIgnoreCase(TEXT_LOCAL_NAME)) {
                        // done with document.
                        break;
                    }
                }
            }

            return c;

        } catch (XMLStreamException | ConcreteException | StringIndexOutOfBoundsException
                | ClassCastException x) {
            throw new IngestException(x);
        } finally {
            if (rdr != null)
                try {
                    rdr.close();
                } catch (XMLStreamException e) {
                    // not likely.
                    LOGGER.info("Error closing XMLReader.", e);
                }
        }
    } catch (IOException e) {
        throw new IngestException(e);
    }
}

From source file:com.evolveum.polygon.connector.hcm.DocumentProcessing.java

/**
 * Streams the XML file at {@code conf.getFilePath()} event by event and collects the attribute
 * values found between {@code EMPLOYEES} start and end elements.
 *
 * <p>Collected values are accumulated in the {@code attributeMap} field and handed to
 * {@code handleEmployeeData} when an employee element ends (or when {@code checkFilter}
 * rejects the current element). {@code handleBufferedData} is invoked at the end of the
 * document. When the {@code attrsToGet} field is not empty, only elements named in it are
 * evaluated; the uid and primary id attribute names are added to it first.
 *
 * @param conf               connector configuration supplying the file path, uid attribute
 *                           name and primary id name
 * @param handler            results handler forwarded to {@code handleEmployeeData} and
 *                           {@code handleBufferedData}
 * @param schemaAttributeMap schema attributes; replaced by the result of
 *                           {@code modifySchemaAttributeMap} when specific attributes are queried
 * @param query              filter forwarded to {@code checkFilter} on every end element
 * @return the {@code attributeMap} field as it stands after processing
 * @throws ConnectorIOException if the file cannot be found or the XML stream cannot be parsed;
 *                              the original exception is attached as the cause
 */
public Map<String, Object> parseXMLData(HcmConnectorConfiguration conf, ResultsHandler handler,
        Map<String, Object> schemaAttributeMap, Filter query) {

    XMLInputFactory factory = XMLInputFactory.newInstance();
    // Both resources are declared here so the finally block can release them on every path,
    // including the early "break" and any exception.
    FileReader fileReader = null;
    XMLEventReader eventReader = null;
    try {

        String uidAttributeName = conf.getUidAttribute();
        String primariId = conf.getPrimaryId();
        String startName = "";
        String value = null;

        StringBuilder assignmentXMLBuilder = null;
        List<String> builderList = new ArrayList<String>();

        int nOfIterations = 0;
        boolean isAssigment = false;
        boolean evaluateAttr = true;
        boolean specificAttributeQuery = false;

        // NOTE(review): FileReader decodes with the platform default charset and ignores the
        // encoding declared in the XML prolog - confirm the input files match the platform.
        fileReader = new FileReader(conf.getFilePath());
        eventReader = factory.createXMLEventReader(fileReader);
        List<String> dictionary = populateDictionary(FIRSTFLAG);

        if (!attrsToGet.isEmpty()) {

            attrsToGet.add(uidAttributeName);
            attrsToGet.add(primariId);
            specificAttributeQuery = true;
            evaluateAttr = false;
            LOGGER.ok("The uid and primary id were added to the queried attribute list");

            schemaAttributeMap = modifySchemaAttributeMap(schemaAttributeMap);
        }

        while (eventReader.hasNext()) {

            XMLEvent event = eventReader.nextEvent();
            int code = event.getEventType();

            if (code == XMLStreamConstants.START_ELEMENT) {

                startName = event.asStartElement().getName().getLocalPart();

                // In specific-attribute mode, evaluation is switched on per requested element.
                if (!evaluateAttr && attrsToGet.contains(startName)) {
                    evaluateAttr = true;
                }

                if (!elementIsEmployeeData) {

                    if (startName.equals(EMPLOYEES)) {

                        // Passed as a String so the log output is not locale-formatted.
                        String iterationCount = Integer.toString(nOfIterations);
                        if (dictionary.contains(iterationCount)) {
                            LOGGER.ok("The defined number of iterations has been hit: {0}",
                                    iterationCount);
                            break;
                        }
                        startName = "";
                        elementIsEmployeeData = true;
                        nOfIterations++;
                    }
                } else if (evaluateAttr) {

                    if (isAssigment) {
                        builderList = processAssignment(startName, null, START, builderList);
                    } else if (ASSIGNMENTTAG.equals(startName)) {
                        assignmentXMLBuilder = new StringBuilder();
                        isAssigment = true;
                    }

                    if (multiValuedAttributesList.contains(startName)) {
                        elementIsMultiValued = true;
                    }
                }

            } else if (elementIsEmployeeData) {

                if (code == XMLStreamConstants.CHARACTERS && evaluateAttr) {

                    Characters characters = event.asCharacters();

                    if (!characters.isWhiteSpace()) {
                        // Text of one element may arrive in several CHARACTERS events.
                        value = (value != null) ? value + characters.getData() : characters.getData();
                    }
                } else if (code == XMLStreamConstants.END_ELEMENT) {

                    String endName = event.asEndElement().getName().getLocalPart();

                    boolean isSubjectToQuery = checkFilter(endName, value, query, uidAttributeName);

                    if (!isSubjectToQuery) {
                        // Discard the current employee and fall through to the EMPLOYEES branch.
                        attributeMap.clear();
                        elementIsEmployeeData = false;
                        value = null;

                        endName = EMPLOYEES;
                    }

                    if (endName.equals(EMPLOYEES)) {

                        attributeMap = handleEmployeeData(attributeMap, schemaAttributeMap, handler,
                                uidAttributeName, primariId);

                        elementIsEmployeeData = false;

                    } else if (evaluateAttr) {

                        if (endName.equals(startName)) {
                            if (value != null) {

                                if (!isAssigment) {
                                    if (!elementIsMultiValued) {
                                        attributeMap.put(startName, value);
                                    } else {
                                        multiValuedAttributeBuffer.put(startName, value);
                                    }
                                } else {
                                    value = StringEscapeUtils.escapeXml10(value);
                                    builderList = processAssignment(endName, value, VALUE, builderList);
                                    builderList = processAssignment(endName, null, END, builderList);
                                }
                                value = null;
                            }
                        } else if (endName.equals(ASSIGNMENTTAG)) {

                            builderList = processAssignment(endName, null, CLOSE, builderList);

                            for (String records : builderList) {
                                assignmentXMLBuilder.append(records);
                            }
                            attributeMap.put(ASSIGNMENTTAG, assignmentXMLBuilder.toString());

                            builderList = new ArrayList<String>();
                            isAssigment = false;

                        } else if (multiValuedAttributesList.contains(endName)) {
                            processMultiValuedAttributes(multiValuedAttributeBuffer);
                        }
                    }

                    // In specific-attribute mode, evaluation is switched off again after each element.
                    if (specificAttributeQuery && evaluateAttr) {
                        evaluateAttr = false;
                    }
                }
            } else if (code == XMLStreamConstants.END_DOCUMENT) {
                handleBufferedData(uidAttributeName, primariId, handler);
            }
        }

    } catch (FileNotFoundException e) {
        LOGGER.error("File not found at the specified path: {0}", e);
        // Keep the original exception as the cause so the stack trace is not lost.
        throw new ConnectorIOException(
                "File not found at the specified path." + e.getLocalizedMessage(), e);
    } catch (XMLStreamException e) {
        LOGGER.error("Unexpected processing error while parsing the .xml document : {0}", e);
        throw new ConnectorIOException(
                "Unexpected processing error while parsing the .xml document. "
                        + e.getLocalizedMessage(), e);
    } finally {
        // XMLEventReader.close() does not close the underlying input source, so the
        // FileReader has to be closed separately. Close failures are logged, not rethrown.
        if (eventReader != null) {
            try {
                eventReader.close();
            } catch (XMLStreamException e) {
                LOGGER.error("Error while closing the XML event reader: {0}", e);
            }
        }
        if (fileReader != null) {
            try {
                fileReader.close();
            } catch (java.io.IOException e) {
                LOGGER.error("Error while closing the .xml document: {0}", e);
            }
        }
    }
    return attributeMap;

}

From source file:com.msopentech.odatajclient.testservice.utils.XMLUtilities.java

/**
 * Copies the XML events read from {@code content} to {@code writer}, surrounded by a newline
 * space event before and after.
 *
 * <p>START_DOCUMENT, END_DOCUMENT and COMMENT events are not copied. The event reader and
 * {@code content} are always released, even when obtaining the reader, copying, or closing
 * the reader fails.
 *
 * @param content stream holding the XML to copy; closed quietly before this method returns
 * @param writer  destination for the copied events; left open
 * @throws Exception if the reader cannot be obtained or reading/writing an event fails
 */
private void addAtomElement(final InputStream content, final XMLEventWriter writer) throws Exception {
    try {
        final XMLEventReader reader = getEventReader(content);
        try {
            final XMLEvent newLine = XMLEventFactory.newInstance().createSpace("\n");

            writer.add(newLine);

            while (reader.hasNext()) {
                final XMLEvent event = reader.nextEvent();
                final int type = event.getEventType();

                if (type != XMLStreamConstants.START_DOCUMENT
                        && type != XMLStreamConstants.END_DOCUMENT
                        && type != XMLStreamConstants.COMMENT) {
                    writer.add(event);
                }
            }
            writer.add(newLine);
        } finally {
            reader.close();
        }
    } finally {
        // Outer finally so the stream is released even if reader.close() itself throws.
        IOUtils.closeQuietly(content);
    }
}

From source file:com.logiware.accounting.domain.EdiInvoice.java

/**
 * Reads an ECU Line invoice XML file and populates this invoice from it.
 *
 * <p>Start, character and end events are dispatched to {@code setElementType},
 * {@code setCharacterType}, {@code setValue}, {@code removeCharacterType} and
 * {@code removeElementType}, while the section flags ({@code isHeader}, {@code isBody},
 * {@code isInformation}, {@code isDetails}, {@code isSummary}) track the current position.
 * After parsing, {@code company} is set to {@code Company.ECU_LINE}, {@code status} is looked
 * up through {@code EdiInvoiceDAO}, and the names recorded in {@code elements} are checked
 * against the list of required elements.
 *
 * @param file the invoice XML file to read
 * @throws Exception if the file cannot be read or parsed, or an {@code AccountingException}
 *                   naming the first required element that is missing
 */
private void createEcuLineInvoice(File file) throws Exception {
    XMLInputFactory inputFactory = XMLInputFactory.newInstance();
    InputStream inputStream = new FileInputStream(file);
    try {
        XMLEventReader eventReader = inputFactory.createXMLEventReader(inputStream);
        try {
            while (eventReader.hasNext()) {
                XMLEvent event = eventReader.nextEvent();
                if (event.isStartElement()) {
                    StartElement startElement = event.asStartElement();
                    String name = startElement.getName().toString();
                    if ("Header".equalsIgnoreCase(name)) {
                        isHeader = true;
                        elements.add("Header");
                    } else if ("Body".equalsIgnoreCase(name)) {
                        isBody = true;
                        elements.add("Body");
                    } else if (isBody && "Information".equalsIgnoreCase(name)) {
                        isInformation = true;
                        elements.add("Information");
                    } else if (isBody && !isInformation && "Details".equalsIgnoreCase(name)) {
                        isDetails = true;
                        elements.add("Details");
                    } else if (isBody && !isInformation && !isDetails
                            && "Summary".equalsIgnoreCase(name)) {
                        isSummary = true;
                        elements.add("Summary");
                    } else if (null == elementType) {
                        setElementType(startElement);
                    } else if (null == characterType) {
                        // elementType is known to be non-null here.
                        setCharacterType(startElement);
                    }
                } else if (event.isCharacters()) {
                    setValue(event.asCharacters());
                } else if (event.isEndElement()) {
                    EndElement endElement = event.asEndElement();
                    String name = endElement.getName().toString();
                    if (null != characterType && null != elementType) {
                        removeCharacterType();
                    } else if (null != elementType) {
                        removeElementType(endElement);
                    } else if (isSummary && "Summary".equalsIgnoreCase(name)) {
                        isSummary = false;
                    } else if (isDetails && "Details".equalsIgnoreCase(name)) {
                        isDetails = false;
                    } else if (isBody && "Information".equalsIgnoreCase(name)) {
                        isInformation = false;
                    } else if ("Body".equalsIgnoreCase(name)) {
                        isBody = false;
                    } else if ("Header".equalsIgnoreCase(name)) {
                        isHeader = false;
                    }
                }
            }
            this.company = Company.ECU_LINE;
            status = new EdiInvoiceDAO().getStatus(vendorNumber, invoiceNumber);

            // Checked in this order; the first missing element determines the error message.
            requireEcuLineElement("Header", "Bad File. <Header> element missing");
            requireEcuLineElement("Body", "Bad File. <Body> missing");
            requireEcuLineElement("Information", "Bad File. <Information> element under <Body> missing");
            requireEcuLineElement("Details", "Bad File. <Details> element under <Body> missing");
            requireEcuLineElement("Summary", "Bad File. <Summary> element under <Body> missing");
            requireEcuLineElement("Applicationreference",
                    "Bad File. <Applicationreference> element under <Header> missing");
            requireEcuLineElement("Reference", "Bad File. <Reference> element under <Header> missing");
            requireEcuLineElement("Sender", "Bad File. <Sender> element under <Header> missing");
            requireEcuLineElement("Code", "Bad File. <Code> element under <Sender> of <Header> missing");
            requireEcuLineElement("Invoice",
                    "Bad File. <Invoice> element under <Information> element of <Body> missing");
            requireEcuLineElement("RelatedReferences",
                    "Bad File. <RelatedReferences> element under <Information> element of <Body> missing");
            requireEcuLineElement("BY",
                    "Bad File. <Parties Qualifier=\"BY\"> under <Information> element of <Body> missing");
            requireEcuLineElement("SU",
                    "Bad File. <Parties Qualifier=\"SU\"> under <Information> element of <Body> missing");
            requireEcuLineElement("PaymentTerms",
                    "Bad File. <PaymentTerms> element under <Information> element of <Body> missing");
            requireEcuLineElement("ShipmentInformation",
                    "Bad File. <ShipmentInformation> element under <Information> element of <Body> missing");
            requireEcuLineElement("Detail",
                    "Bad File. <Detail> element under <Details> element of <Body> missing");
            requireEcuLineElement("TotalMonetaryAmount",
                    "Bad File. <TotalMonetaryAmount> element under <Summary> element of <Body> missing");
            requireEcuLineElement("TotalMonetaryAmountGroupByVAT",
                    "Bad File. <TotalMonetaryAmountGroupByVAT> element under <Summary> element of <Body> missing");
        } finally {
            eventReader.close();
        }
    } finally {
        // Separate finally so the stream is closed even when eventReader.close() throws.
        inputStream.close();
    }
}

/**
 * Fails with the given message unless {@code name} was recorded in {@code elements}.
 *
 * @param name    element name (or qualifier) that must have been recorded
 * @param message text of the {@code AccountingException} thrown when it is absent
 * @throws AccountingException if {@code elements} does not contain {@code name}
 */
private void requireEcuLineElement(String name, String message) throws AccountingException {
    if (!elements.contains(name)) {
        throw new AccountingException(message);
    }
}