Example usage for javax.xml.stream XMLEventReader nextEvent

List of usage examples for javax.xml.stream XMLEventReader nextEvent

Introduction

In this page you can find the example usage for javax.xml.stream XMLEventReader nextEvent.

Prototype

public XMLEvent nextEvent() throws XMLStreamException;

Source Link

Document

Gets the next XMLEvent.

Usage

From source file:de.dfki.km.leech.parser.wikipedia.WikipediaDumpParser.java

public MultiValueHashMap<String, String> getPageTitle2Redirects(InputStream sWikipediaDump)
        throws FileNotFoundException, XMLStreamException {
    // <text xml:space="preserve">#REDIRECT [[Autopoiesis]]</text>
    // <text xml:space="preserve">#REDIRECT:[[Hans Leo Haler]]</text>
    // <text xml:space="preserve">#redirect [[Weier Hai]]</text>
    // #weiterleitung
    // <page>
    // <title>Autopoiesis</title>

    Logger.getLogger(WikipediaDumpParser.class.getName()).info("will collect redirects from wikipedia dump...");

    MultiValueHashMap<String, String> hsPageTitle2Redirects = new MultiValueBalancedTreeMap<String, String>();

    String strCurrentTitle = "";
    XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();

    XMLEventReader xmlEventReader = xmlInputFactory.createXMLEventReader(sWikipediaDump, "Utf-8");
    int iTitlesRead = 0;
    while (xmlEventReader.hasNext()) {
        XMLEvent xmlEvent = xmlEventReader.nextEvent();

        if (!xmlEvent.isStartElement())
            continue;
        // wenn wir einen Title haben, dann merken wir uns den, falls wir ihn brauchen
        if (xmlEvent.asStartElement().getName().getLocalPart().equals("title")) {
            strCurrentTitle = readNextCharEventsText(xmlEventReader);

            iTitlesRead++;/* ww  w  . j av  a 2  s .  co  m*/
            if (iTitlesRead % 200000 == 0)
                Logger.getLogger(WikipediaDumpParser.class.getName())
                        .info("read doc #" + StringUtils.beautifyNumber(iTitlesRead));

            continue;
        }

        if (!xmlEvent.asStartElement().getName().getLocalPart().equals("text"))
            continue;

        // jetzt haben wir ein text-tag. Wir schauen, ob jetzt ein redirect kommt
        // entweder kommt ein charEvent oder ein EndEvent. Leere Texte gibts wohl auch
        XMLEvent nextEvent = xmlEventReader.peek();

        if (!nextEvent.isCharacters())
            continue;

        String strCharEventData = readNextCharEventsText(xmlEventReader);
        if (strCharEventData == null)
            continue;

        strCharEventData = strCharEventData.trim();

        boolean bRedirect = false;

        if (strCharEventData.length() >= 9 && strCharEventData.substring(0, 9).equalsIgnoreCase("#redirect"))
            bRedirect = true;
        if (!bRedirect && strCharEventData.length() >= 8
                && strCharEventData.substring(0, 8).equalsIgnoreCase("redirect")
                && !strCharEventData.contains("\n"))
            bRedirect = true;
        if (!bRedirect && strCharEventData.length() >= 14
                && strCharEventData.substring(0, 14).equalsIgnoreCase("#weiterleitung"))
            bRedirect = true;
        if (!bRedirect && strCharEventData.length() >= 13
                && strCharEventData.substring(0, 13).equalsIgnoreCase("weiterleitung")
                && !strCharEventData.contains("\n"))
            bRedirect = true;

        if (!bRedirect)
            continue;

        // wir haben einen redirect - der wird in unsere Datenstruktur eingetragen
        int iStart = strCharEventData.indexOf("[[");
        int iEnd = strCharEventData.indexOf("]]");
        if (iStart < 0 || iEnd < 0)
            continue;
        if (iEnd <= iStart)
            continue;
        if ((iStart + 2) > strCharEventData.length() || iEnd > strCharEventData.length())
            continue;

        String strRedirectTarget = strCharEventData.substring(iStart + 2, iEnd).trim();
        hsPageTitle2Redirects.add(strRedirectTarget, strCurrentTitle);

        // if("Venceslav Konstantinov".equalsIgnoreCase(strCurrentTitle) || "Venceslav Konstantinov".equalsIgnoreCase(strRedirectTarget))
        // System.out.println("redirect found: (" + hsPageTitle2Redirects.keySize() + ") " + strCurrentTitle + " => '" + strRedirectTarget + "'");

    }

    Logger.getLogger(WikipediaDumpParser.class.getName())
            .info("Redirects found: " + StringUtils.beautifyNumber(hsPageTitle2Redirects.valueSize()));

    return hsPageTitle2Redirects;

}

From source file:json_to_xml_1.java

public int execute(String args[]) throws ProgramTerminationException {
    this.getInfoMessages().clear();

    if (args.length < 2) {
        throw constructTermination("messageArgumentsMissing", null,
                getI10nString("messageArgumentsMissingUsage") + "\n\tjson_to_xml_1 "
                        + getI10nString("messageParameterList") + "\n");
    }/*w  w w  . j a v  a2  s. c  o  m*/

    File resultInfoFile = new File(args[1]);

    try {
        resultInfoFile = resultInfoFile.getCanonicalFile();
    } catch (SecurityException ex) {
        throw constructTermination("messageResultInfoFileCantGetCanonicalPath", ex, null,
                resultInfoFile.getAbsolutePath());
    } catch (IOException ex) {
        throw constructTermination("messageResultInfoFileCantGetCanonicalPath", ex, null,
                resultInfoFile.getAbsolutePath());
    }

    if (resultInfoFile.exists() == true) {
        if (resultInfoFile.isFile() == true) {
            if (resultInfoFile.canWrite() != true) {
                throw constructTermination("messageResultInfoFileIsntWritable", null, null,
                        resultInfoFile.getAbsolutePath());
            }
        } else {
            throw constructTermination("messageResultInfoPathIsntAFile", null, null,
                    resultInfoFile.getAbsolutePath());
        }
    }

    json_to_xml_1.resultInfoFile = resultInfoFile;

    File jobFile = new File(args[0]);

    try {
        jobFile = jobFile.getCanonicalFile();
    } catch (SecurityException ex) {
        throw constructTermination("messageJobFileCantGetCanonicalPath", ex, null, jobFile.getAbsolutePath());
    } catch (IOException ex) {
        throw constructTermination("messageJobFileCantGetCanonicalPath", ex, null, jobFile.getAbsolutePath());
    }

    if (jobFile.exists() != true) {
        throw constructTermination("messageJobFileDoesntExist", null, null, jobFile.getAbsolutePath());
    }

    if (jobFile.isFile() != true) {
        throw constructTermination("messageJobPathIsntAFile", null, null, jobFile.getAbsolutePath());
    }

    if (jobFile.canRead() != true) {
        throw constructTermination("messageJobFileIsntReadable", null, null, jobFile.getAbsolutePath());
    }

    System.out.println("json_to_xml_1: " + getI10nStringFormatted("messageCallDetails",
            jobFile.getAbsolutePath(), resultInfoFile.getAbsolutePath()));

    File inputFile = null;
    File outputFile = null;

    try {
        XMLInputFactory inputFactory = XMLInputFactory.newInstance();
        InputStream in = new FileInputStream(jobFile);
        XMLEventReader eventReader = inputFactory.createXMLEventReader(in);

        while (eventReader.hasNext() == true) {
            XMLEvent event = eventReader.nextEvent();

            if (event.isStartElement() == true) {
                String tagName = event.asStartElement().getName().getLocalPart();

                if (tagName.equals("json-input-file") == true) {
                    StartElement inputFileElement = event.asStartElement();
                    Attribute pathAttribute = inputFileElement.getAttributeByName(new QName("path"));

                    if (pathAttribute == null) {
                        throw constructTermination("messageJobFileEntryIsMissingAnAttribute", null, null,
                                jobFile.getAbsolutePath(), tagName, "path");
                    }

                    String inputFilePath = pathAttribute.getValue();

                    if (inputFilePath.isEmpty() == true) {
                        throw constructTermination("messageJobFileAttributeValueIsEmpty", null, null,
                                jobFile.getAbsolutePath(), tagName, "path");
                    }

                    inputFile = new File(inputFilePath);

                    if (inputFile.isAbsolute() != true) {
                        inputFile = new File(
                                jobFile.getAbsoluteFile().getParent() + File.separator + inputFilePath);
                    }

                    try {
                        inputFile = inputFile.getCanonicalFile();
                    } catch (SecurityException ex) {
                        throw constructTermination("messageInputFileCantGetCanonicalPath", ex, null,
                                new File(inputFilePath).getAbsolutePath(), jobFile.getAbsolutePath());
                    } catch (IOException ex) {
                        throw constructTermination("messageInputFileCantGetCanonicalPath", ex, null,
                                new File(inputFilePath).getAbsolutePath(), jobFile.getAbsolutePath());
                    }

                    if (inputFile.exists() != true) {
                        throw constructTermination("messageInputFileDoesntExist", null, null,
                                inputFile.getAbsolutePath(), jobFile.getAbsolutePath());
                    }

                    if (inputFile.isFile() != true) {
                        throw constructTermination("messageInputPathIsntAFile", null, null,
                                inputFile.getAbsolutePath(), jobFile.getAbsolutePath());
                    }

                    if (inputFile.canRead() != true) {
                        throw constructTermination("messageInputFileIsntReadable", null, null,
                                inputFile.getAbsolutePath(), jobFile.getAbsolutePath());
                    }
                } else if (tagName.equals("xml-output-file") == true) {
                    StartElement outputFileElement = event.asStartElement();
                    Attribute pathAttribute = outputFileElement.getAttributeByName(new QName("path"));

                    if (pathAttribute == null) {
                        throw constructTermination("messageJobFileEntryIsMissingAnAttribute", null, null,
                                jobFile.getAbsolutePath(), tagName, "path");
                    }

                    String outputFilePath = pathAttribute.getValue();

                    if (outputFilePath.isEmpty() == true) {
                        throw constructTermination("messageJobFileAttributeValueIsEmpty", null, null,
                                jobFile.getAbsolutePath(), tagName, "path");
                    }

                    outputFile = new File(outputFilePath);

                    if (outputFile.isAbsolute() != true) {
                        outputFile = new File(
                                jobFile.getAbsoluteFile().getParent() + File.separator + outputFilePath);
                    }

                    try {
                        outputFile = outputFile.getCanonicalFile();
                    } catch (SecurityException ex) {
                        throw constructTermination("messageOutputFileCantGetCanonicalPath", ex, null,
                                new File(outputFilePath).getAbsolutePath(), jobFile.getAbsolutePath());
                    } catch (IOException ex) {
                        throw constructTermination("messageOutputFileCantGetCanonicalPath", ex, null,
                                new File(outputFilePath).getAbsolutePath(), jobFile.getAbsolutePath());
                    }

                    if (outputFile.exists() == true) {
                        if (outputFile.isFile() == true) {
                            if (outputFile.canWrite() != true) {
                                throw constructTermination("messageOutputFileIsntWritable", null, null,
                                        outputFile.getAbsolutePath());
                            }
                        } else {
                            throw constructTermination("messageOutputPathIsntAFile", null, null,
                                    outputFile.getAbsolutePath());
                        }
                    }
                }
            }
        }
    } catch (XMLStreamException ex) {
        throw constructTermination("messageJobFileErrorWhileReading", ex, null, jobFile.getAbsolutePath());
    } catch (SecurityException ex) {
        throw constructTermination("messageJobFileErrorWhileReading", ex, null, jobFile.getAbsolutePath());
    } catch (IOException ex) {
        throw constructTermination("messageJobFileErrorWhileReading", ex, null, jobFile.getAbsolutePath());
    }

    if (inputFile == null) {
        throw constructTermination("messageJobFileNoInputFile", null, null, jobFile.getAbsolutePath());
    }

    if (outputFile == null) {
        throw constructTermination("messageJobFileNoOutputFile", null, null, jobFile.getAbsolutePath());
    }

    StringBuilder stringBuilder = new StringBuilder();

    try {
        JSONObject json = new JSONObject(new JSONTokener(new BufferedReader(new FileReader(inputFile))));

        stringBuilder.append(XML.toString(json));
    } catch (Exception ex) {
        throw constructTermination("messageConversionError", ex, null, inputFile.getAbsolutePath());
    }

    try {
        BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(outputFile), "UTF-8"));

        writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
        writer.write(
                "<!-- This file was created by json_to_xml_1, which is free software licensed under the GNU Affero General Public License 3 or any later version (see https://github.com/publishing-systems/digital_publishing_workflow_tools/ and http://www.publishing-systems.org). -->\n");
        writer.write(stringBuilder.toString());

        writer.flush();
        writer.close();
    } catch (FileNotFoundException ex) {
        throw constructTermination("messageOutputFileWritingError", ex, null, outputFile.getAbsolutePath());
    } catch (UnsupportedEncodingException ex) {
        throw constructTermination("messageOutputFileWritingError", ex, null, outputFile.getAbsolutePath());
    } catch (IOException ex) {
        throw constructTermination("messageOutputFileWritingError", ex, null, outputFile.getAbsolutePath());
    }

    return 0;
}

From source file:sapience.injectors.stax.inject.StringBasedStaxStreamInjector.java

/**
 * If the reference is more then a simple attribute, we have to add new XML (subtree) to the stream. We transform
 * the reference into an InputStream and invoke another SAX parsing process for it. But the parsed events are added
 * to the main XMLEventWriter. /*w  w w . ja v  a 2 s. c  o m*/
 *
 * @param w
 * @param string
 * @throws XMLStreamException 
 * @throws XMLStreamException
 */
private void createEventsForElement(XMLEventWriter w, Reference ref) throws XMLStreamException {
    XMLEventReader r = null;
    try {
        StringBuilder target = new StringBuilder(ref.getTarget().toString());

        NamespaceContext c = w.getNamespaceContext();

        // process namespaces
        //processNamespace(target, w.getNamespaceContext());

        ByteArrayInputStream bais = new ByteArrayInputStream(target.toString().getBytes());
        this.inFac.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, false);
        r = this.inFac.createXMLEventReader(bais);
        // start a new line

        while (r.hasNext()) {
            XMLEvent e = r.nextEvent();
            switch (e.getEventType()) {
            case XMLEvent.START_DOCUMENT:
                break;
            case XMLEvent.END_DOCUMENT:
                break;
            default:
                w.add(e);
                break;
            }
        }
    } finally {
        ;

        if (r != null)
            r.close();
    }

}

From source file:de.dfki.km.leech.parser.wikipedia.WikipediaDumpParser.java

@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {

    try {/*  w w w  . j  av  a2s.c om*/

        // wir iterieren schn ber die page-Eintrge. Darin gibt es dann title, timestamp, <contributor> => <username> und text. den text mssen
        // wir noch bereinigen. dazu nehmen wir eine Vorverarbeitung mit bliki - dazu mssen wir aber selbst nochmal den String vorbereiten und
        // nachbereinigen. Leider.

        WikipediaDumpParserConfig wikipediaDumpParserConfig = context.get(WikipediaDumpParserConfig.class);

        if (wikipediaDumpParserConfig == null) {
            Logger.getLogger(WikipediaDumpParser.class.getName())
                    .info("No wikipedia parser config found. Will take the default one.");
            wikipediaDumpParserConfig = new WikipediaDumpParserConfig();
        }

        TikaInputStream tikaStream = TikaInputStream.get(stream);

        File fWikipediaDumpFile4Stream = tikaStream.getFile();

        MultiValueHashMap<String, String> hsPageTitle2Redirects = new MultiValueHashMap<String, String>();
        if (wikipediaDumpParserConfig.determinePageRedirects)
            hsPageTitle2Redirects = getPageTitle2Redirects(new FileInputStream(fWikipediaDumpFile4Stream));

        HashSet<String> hsRedirectPageTitles = new HashSet<String>(hsPageTitle2Redirects.values());

        String strCleanedText = "";
        String strBaseURL = null;

        XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
        XMLEventReader xmlEventReader = xmlInputFactory
                .createXMLEventReader(new FileInputStream(fWikipediaDumpFile4Stream), "Utf-8");
        while (xmlEventReader.hasNext()) {

            XMLEvent xmlEvent = xmlEventReader.nextEvent();

            if (xmlEvent.isEndElement() && xmlEvent.asEndElement().getName().getLocalPart().equals("page")) {
                if (metadata.size() == 0)
                    continue;

                // den mimetype wollen wir auch noch in den Metadaten haben
                metadata.add(Metadata.CONTENT_TYPE, "application/wikipedia+xml");

                XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
                xhtml.startDocument();

                xhtml.startElement("p");
                xhtml.characters(strCleanedText.toCharArray(), 0, strCleanedText.length());
                xhtml.endElement("p");

                xhtml.endDocument();

            }

            if (!xmlEvent.isStartElement())
                continue;

            // ##### die siteinfo

            if (strBaseURL == null && xmlEvent.asStartElement().getName().getLocalPart().equals("base")) {
                // http://de.wikipedia.org/wiki/Wikipedia:Hauptseite =>http://de.wikipedia.org/wiki/
                strBaseURL = readNextCharEventsText(xmlEventReader);
                strBaseURL = strBaseURL.substring(0, strBaseURL.lastIndexOf("/") + 1);
            }

            // ##### die page

            if (xmlEvent.asStartElement().getName().getLocalPart().equals("page")) {
                for (String strKey : metadata.names())
                    metadata.remove(strKey);
            }

            // ##### der Title

            if (xmlEvent.asStartElement().getName().getLocalPart().equals("title")) {
                // wir merken uns immer den aktuellen Titel
                String strCurrentTitle = readNextCharEventsText(xmlEventReader);

                if (strCurrentTitle.equalsIgnoreCase("DuckDuckGo")) {
                    int fasd = 8;
                }

                if (strCurrentTitle.toLowerCase().contains("duck")
                        && strCurrentTitle.toLowerCase().contains("go")) {
                    int is = 666;
                }

                // wenn der Titel eine redirect-Page ist, dann tragen wir die ganze Page aus der EventQueue aus, springen an das endPage, und
                // haben somit diese Seite ignoriert. Ferner ignorieren wir auch spezielle wikipedia-Seiten
                String strSmallTitle = strCurrentTitle.trim().toLowerCase();
                if (hsRedirectPageTitles.contains(strCurrentTitle)
                        || hsRedirectPageTitles.contains(strSmallTitle)
                        || hsRedirectPageTitles.contains(strCurrentTitle.trim())
                        || strSmallTitle.startsWith("category:") || strSmallTitle.startsWith("kategorie:")
                        || strSmallTitle.startsWith("vorlage:") || strSmallTitle.startsWith("template:")
                        || strSmallTitle.startsWith("hilfe:") || strSmallTitle.startsWith("help:")
                        || strSmallTitle.startsWith("wikipedia:") || strSmallTitle.startsWith("portal:")
                        || strSmallTitle.startsWith("mediawiki:")) {

                    while (true) {
                        XMLEvent nextXmlEvent = xmlEventReader.nextEvent();
                        if (nextXmlEvent.isEndElement()
                                && nextXmlEvent.asEndElement().getName().getLocalPart().equals("page"))
                            break;
                    }
                } else {
                    metadata.add(Metadata.TITLE, strCurrentTitle);
                    metadata.add(Metadata.SOURCE, strBaseURL + strCurrentTitle);

                    for (String strRedirect : hsPageTitle2Redirects.get(strCurrentTitle)) {
                        // wir ignorieren Titel, die sich lediglich durch gro/kleinschreibung unterscheiden
                        if (!StringUtils.containsIgnoreCase(strRedirect, metadata.getValues(Metadata.TITLE)))
                            metadata.add(Metadata.TITLE, strRedirect);
                    }
                }

                continue;
            }

            // ##### der text
            if (xmlEvent.asStartElement().getName().getLocalPart().equals("text")) {
                String strText = readNextCharEventsText(xmlEventReader);

                if (wikipediaDumpParserConfig.parseLinksAndCategories)
                    parseLinksAndCategories(strText, strBaseURL, metadata, handler);
                if (wikipediaDumpParserConfig.parseInfoBoxes)
                    parseInfoBox(strText, metadata, handler);
                if (wikipediaDumpParserConfig.parseGeoCoordinates)
                    parseGeoCoordinates(strText, metadata);

                // aufgrund einiger Defizite in dem verwendeten cleaner mssen wir hier leider noch zu-und nacharbeiten
                strText = strText.replaceAll("==\n", "==\n\n");
                strText = strText.replaceAll("\n==", "\n\n==");

                strCleanedText = m_wikiModel.render(new PlainTextConverter(), strText);

                strCleanedText = strCleanedText.replaceAll("\\{\\{", " ");
                strCleanedText = strCleanedText.replaceAll("\\}\\}", " ");

                strCleanedText = StringEscapeUtils.unescapeHtml4(strCleanedText);

                continue;
            }

            // ##### der timestamp
            if (xmlEvent.asStartElement().getName().getLocalPart().equals("timestamp")) {
                String strTimestamp = readNextCharEventsText(xmlEventReader);

                metadata.add(Metadata.MODIFIED, strTimestamp);

                continue;
            }

            // ##### der username
            if (xmlEvent.asStartElement().getName().getLocalPart().equals("username")) {
                String strUsername = readNextCharEventsText(xmlEventReader);

                metadata.add(Metadata.CREATOR, strUsername);

                continue;
            }

        }

    } catch (Exception e) {
        Logger.getLogger(WikipediaDumpParser.class.getName()).log(Level.SEVERE, "Error", e);
    }

}

From source file:ca.phon.session.io.xml.v12.XMLSessionReader_v12.java

@Override
public boolean canRead(File file) throws IOException {
    // open file and make sure the first
    // element is 'session' with the correct version
    boolean canRead = false;

    // use StAX to read only first element
    // create StAX reader
    XMLInputFactory factory = XMLInputFactory.newInstance();
    XMLEventReader reader = null;
    try (FileInputStream source = new FileInputStream(file)) {
        //BufferedReader in = new BufferedReader(new InputStreamReader(source, "UTF-8"));
        XMLEventReader xmlReader = factory.createXMLEventReader(source, "UTF-8");
        reader = factory.createFilteredReader(xmlReader, new XMLWhitespaceFilter());

        XMLEvent evt;//from www .  ja v  a 2 s. c o  m
        while (!(evt = reader.nextEvent()).isStartElement())
            ;
        canRead = evt.asStartElement().getName().getLocalPart().equals("session")
                && evt.asStartElement().getAttributeByName(new QName("version")).getValue().equals("PB1.2");
    } catch (XMLStreamException e) {
        throw new IOException(e);
    }

    return canRead;
}

From source file:com.msopentech.odatajclient.testservice.utils.XMLUtilities.java

private int countFeedElements(final InputStream is, final String elementName) throws XMLStreamException {
    final XMLEventReader reader = getEventReader(is);

    int count = 0;

    while (reader.hasNext()) {
        final XMLEvent event = reader.nextEvent();

        if (event.getEventType() == XMLStreamConstants.START_ELEMENT
                && elementName.equals(event.asStartElement().getName().getLocalPart())) {
            count++;//from www .j  av  a2 s  .co m
        }
    }

    reader.close();
    return count;
}

From source file:edu.jhu.hlt.concrete.ingesters.webposts.WebPostIngester.java

@Override
public Communication fromCharacterBasedFile(final Path path) throws IngestException {
    if (!Files.exists(path))
        throw new IngestException("No file at: " + path.toString());

    AnalyticUUIDGeneratorFactory f = new AnalyticUUIDGeneratorFactory();
    AnalyticUUIDGenerator g = f.create();
    Communication c = new Communication();
    c.setUuid(g.next());//from   w  w  w . ja v a  2 s  .co  m
    c.setType(this.getKind());
    c.setMetadata(TooledMetadataConverter.convert(this));

    try {
        ExistingNonDirectoryFile ef = new ExistingNonDirectoryFile(path);
        c.setId(ef.getName().split("\\.")[0]);
    } catch (NoSuchFileException | NotFileException e) {
        // might throw if path is a directory.
        throw new IngestException(path.toString() + " is not a file, or is a directory.");
    }

    String content;
    try (InputStream is = Files.newInputStream(path);
            BufferedInputStream bin = new BufferedInputStream(is, 1024 * 8 * 8);) {
        content = IOUtils.toString(bin, StandardCharsets.UTF_8);
        c.setText(content);
    } catch (IOException e) {
        throw new IngestException(e);
    }

    try (InputStream is = Files.newInputStream(path);
            BufferedInputStream bin = new BufferedInputStream(is, 1024 * 8 * 8);
            BufferedReader reader = new BufferedReader(new InputStreamReader(bin, StandardCharsets.UTF_8));) {
        XMLEventReader rdr = null;
        try {
            rdr = inF.createXMLEventReader(reader);

            // Below method moves the reader
            // to the headline end element.
            Section headline = this.handleBeginning(rdr, content, c);
            headline.setUuid(g.next());
            c.addToSectionList(headline);
            TextSpan sts = headline.getTextSpan();
            LOGGER.debug("headline text: {}", c.getText().substring(sts.getStart(), sts.getEnding()));

            int sectNumber = 1;
            int subSect = 0;

            int currOff = -1;
            // Big amounts of characters.
            while (rdr.hasNext()) {
                XMLEvent nextEvent = rdr.nextEvent();
                currOff = nextEvent.getLocation().getCharacterOffset();

                // First: see if document is going to end.
                // If yes: exit.
                if (nextEvent.isEndDocument())
                    break;

                // region
                // enables ingestion of quotes inside a usenet webpost.
                // by Tongfei Chen
                if (nextEvent.isStartElement()
                        && nextEvent.asStartElement().getName().equals(QName.valueOf("QUOTE"))) {
                    Attribute attrQuote = nextEvent.asStartElement()
                            .getAttributeByName(QName.valueOf("PREVIOUSPOST"));
                    String quote = StringEscapeUtils.escapeXml(attrQuote.getValue());
                    int location = attrQuote.getLocation().getCharacterOffset()
                            + "<QUOTE PREVIOUSPOST=\"".length();
                    Section quoteSection = new Section(g.next(), "quote")
                            .setTextSpan(new TextSpan(location, location + quote.length()));
                    c.addToSectionList(quoteSection);
                }
                // endregion

                // Check if start element.
                if (nextEvent.isCharacters()) {
                    Characters chars = nextEvent.asCharacters();
                    if (!chars.isWhiteSpace()) {
                        String fpContent = chars.getData();
                        LOGGER.debug("Character offset: {}", currOff);
                        LOGGER.debug("Character based data: {}", fpContent);

                        SimpleImmutableEntry<Integer, Integer> pads = trimSpacing(fpContent);
                        final int tsb = currOff + pads.getKey();

                        final int tse = currOff + fpContent.replace("\"", "&quot;").replace("<", "&lt;")
                                .replace(">", "&gt;").length() - (pads.getValue());
                        // MAINTAIN CORRECT TEXT SPAN
                        // CANNOT USE StringEscapeUtils.escapeXml because it will escape "'", which
                        // is not escaped in the data
                        // @tongfei

                        LOGGER.debug("Section text: {}", content.substring(tsb, tse));
                        TextSpan ts = new TextSpan(tsb, tse);
                        String sk;
                        if (subSect == 0)
                            sk = "poster";
                        else if (subSect == 1)
                            sk = "postdate";
                        else
                            sk = "post";

                        Section s = new Section();
                        s.setKind(sk);
                        s.setTextSpan(ts);
                        s.setUuid(g.next());
                        List<Integer> intList = new ArrayList<>();
                        intList.add(sectNumber);
                        intList.add(subSect);
                        s.setNumberList(intList);
                        c.addToSectionList(s);

                        subSect++;
                    }
                } else if (nextEvent.isEndElement()) {
                    EndElement ee = nextEvent.asEndElement();
                    currOff = ee.getLocation().getCharacterOffset();
                    QName name = ee.getName();
                    String localName = name.getLocalPart();
                    LOGGER.debug("Hit end element: {}", localName);
                    if (localName.equalsIgnoreCase(POST_LOCAL_NAME)) {
                        LOGGER.debug("Switching to new post.");
                        sectNumber++;
                        subSect = 0;
                    } else if (localName.equalsIgnoreCase(TEXT_LOCAL_NAME)) {
                        // done with document.
                        break;
                    }
                }
            }

            return c;

        } catch (XMLStreamException | ConcreteException | StringIndexOutOfBoundsException
                | ClassCastException x) {
            throw new IngestException(x);
        } finally {
            if (rdr != null)
                try {
                    rdr.close();
                } catch (XMLStreamException e) {
                    // not likely.
                    LOGGER.info("Error closing XMLReader.", e);
                }
        }
    } catch (IOException e) {
        throw new IngestException(e);
    }
}

From source file:com.evolveum.polygon.connector.hcm.DocumentProcessing.java

public Map<String, Object> parseXMLData(HcmConnectorConfiguration conf, ResultsHandler handler,
        Map<String, Object> schemaAttributeMap, Filter query) {

    XMLInputFactory factory = XMLInputFactory.newInstance();
    try {/*from w w w  .  j  a  va 2s  . c om*/

        String uidAttributeName = conf.getUidAttribute();
        String primariId = conf.getPrimaryId();
        String startName = "";
        String value = null;

        StringBuilder assignmentXMLBuilder = null;

        List<String> builderList = new ArrayList<String>();

        Integer nOfIterations = 0;
        Boolean isSubjectToQuery = false;
        Boolean isAssigment = false;
        Boolean evaluateAttr = true;
        Boolean specificAttributeQuery = false;

        XMLEventReader eventReader = factory.createXMLEventReader(new FileReader(conf.getFilePath()));
        List<String> dictionary = populateDictionary(FIRSTFLAG);

        if (!attrsToGet.isEmpty()) {

            attrsToGet.add(uidAttributeName);
            attrsToGet.add(primariId);
            specificAttributeQuery = true;
            evaluateAttr = false;
            LOGGER.ok("The uid and primary id were added to the queried attribute list");

            schemaAttributeMap = modifySchemaAttributeMap(schemaAttributeMap);
        }

        while (eventReader.hasNext()) {

            XMLEvent event = eventReader.nextEvent();

            Integer code = event.getEventType();

            if (code == XMLStreamConstants.START_ELEMENT) {

                StartElement startElement = event.asStartElement();
                startName = startElement.getName().getLocalPart();

                if (!evaluateAttr && attrsToGet.contains(startName)) {

                    evaluateAttr = true;
                }

                if (!elementIsEmployeeData) {

                    if (startName.equals(EMPLOYEES)) {

                        if (dictionary.contains(nOfIterations.toString())) {
                            LOGGER.ok("The defined number of iterations has been hit: {0}",
                                    nOfIterations.toString());
                            break;
                        } else {
                            startName = "";
                            elementIsEmployeeData = true;
                            nOfIterations++;
                        }
                    }
                } else if (evaluateAttr) {

                    if (!isAssigment) {
                        if (!ASSIGNMENTTAG.equals(startName)) {

                        } else {
                            assignmentXMLBuilder = new StringBuilder();
                            isAssigment = true;
                        }
                    } else {

                        builderList = processAssignment(startName, null, START, builderList);
                    }

                    if (multiValuedAttributesList.contains(startName)) {

                        elementIsMultiValued = true;
                    }

                }

            } else if (elementIsEmployeeData) {

                if (code == XMLStreamConstants.CHARACTERS && evaluateAttr) {

                    Characters characters = event.asCharacters();

                    if (!characters.isWhiteSpace()) {

                        StringBuilder valueBuilder;
                        if (value != null) {
                            valueBuilder = new StringBuilder(value).append("")
                                    .append(characters.getData().toString());
                        } else {
                            valueBuilder = new StringBuilder(characters.getData().toString());
                        }
                        value = valueBuilder.toString();
                        // value = StringEscapeUtils.escapeXml10(value);
                        // LOGGER.info("The attribute value for: {0} is
                        // {1}", startName, value);
                    }
                } else if (code == XMLStreamConstants.END_ELEMENT) {

                    EndElement endElement = event.asEndElement();
                    String endName = endElement.getName().getLocalPart();

                    isSubjectToQuery = checkFilter(endName, value, query, uidAttributeName);

                    if (!isSubjectToQuery) {
                        attributeMap.clear();
                        elementIsEmployeeData = false;
                        value = null;

                        endName = EMPLOYEES;
                    }

                    if (endName.equals(EMPLOYEES)) {

                        attributeMap = handleEmployeeData(attributeMap, schemaAttributeMap, handler,
                                uidAttributeName, primariId);

                        elementIsEmployeeData = false;

                    } else if (evaluateAttr) {

                        if (endName.equals(startName)) {
                            if (value != null) {

                                if (!isAssigment) {
                                    if (!elementIsMultiValued) {

                                        attributeMap.put(startName, value);
                                    } else {

                                        multiValuedAttributeBuffer.put(startName, value);
                                    }
                                } else {

                                    value = StringEscapeUtils.escapeXml10(value);
                                    builderList = processAssignment(endName, value, VALUE, builderList);

                                    builderList = processAssignment(endName, null, END, builderList);
                                }
                                // LOGGER.info("Attribute name: {0} and the
                                // Attribute value: {1}", endName, value);
                                value = null;
                            }
                        } else {
                            if (endName.equals(ASSIGNMENTTAG)) {

                                builderList = processAssignment(endName, null, CLOSE, builderList);

                                // if (assigmentIsActive) {

                                for (String records : builderList) {
                                    assignmentXMLBuilder.append(records);

                                }
                                attributeMap.put(ASSIGNMENTTAG, assignmentXMLBuilder.toString());
                                // } else {
                                // }

                                builderList = new ArrayList<String>();
                                // assigmentIsActive = false;
                                isAssigment = false;

                            } else if (multiValuedAttributesList.contains(endName)) {
                                processMultiValuedAttributes(multiValuedAttributeBuffer);
                            }
                        }

                    }
                    if (specificAttributeQuery && evaluateAttr) {

                        evaluateAttr = false;
                    }
                }
            } else if (code == XMLStreamConstants.END_DOCUMENT) {
                handleBufferedData(uidAttributeName, primariId, handler);
            }
        }

    } catch (FileNotFoundException e) {
        StringBuilder errorBuilder = new StringBuilder("File not found at the specified path.")
                .append(e.getLocalizedMessage());
        LOGGER.error("File not found at the specified path: {0}", e);
        throw new ConnectorIOException(errorBuilder.toString());
    } catch (XMLStreamException e) {

        LOGGER.error("Unexpected processing error while parsing the .xml document : {0}", e);

        StringBuilder errorBuilder = new StringBuilder(
                "Unexpected processing error while parsing the .xml document. ")
                        .append(e.getLocalizedMessage());

        throw new ConnectorIOException(errorBuilder.toString());
    }
    return attributeMap;

}

From source file:com.msopentech.odatajclient.testservice.utils.XMLUtilities.java

private void addAtomElement(final InputStream content, final XMLEventWriter writer) throws Exception {
    final XMLEventReader reader = getEventReader(content);

    final XMLEventFactory eventFactory = XMLEventFactory.newInstance();
    XMLEvent newLine = eventFactory.createSpace("\n");

    try {/*from  ww w .  jav  a  2  s  . c  om*/
        writer.add(newLine);

        while (reader.hasNext()) {
            final XMLEvent event = reader.nextEvent();

            if (event.getEventType() != XMLStreamConstants.START_DOCUMENT
                    && event.getEventType() != XMLStreamConstants.END_DOCUMENT
                    && event.getEventType() != XMLStreamConstants.COMMENT) {
                writer.add(event);
            }
        }
        writer.add(newLine);
    } finally {
        reader.close();
        IOUtils.closeQuietly(content);
    }
}

From source file:com.logiware.accounting.domain.EdiInvoice.java

private void createEcuLineInvoice(File file) throws Exception {
    InputStream inputStream = null;
    XMLEventReader eventReader = null;
    try {/*from w  ww  .  ja  v  a2  s.com*/
        XMLInputFactory inputFactory = XMLInputFactory.newInstance();
        inputStream = new FileInputStream(file);
        eventReader = inputFactory.createXMLEventReader(inputStream);
        while (eventReader.hasNext()) {
            XMLEvent event = eventReader.nextEvent();
            if (event.isStartElement()) {
                StartElement startElement = event.asStartElement();
                if ("Header".equalsIgnoreCase(startElement.getName().toString())) {
                    isHeader = true;
                    elements.add("Header");
                } else if ("Body".equalsIgnoreCase(startElement.getName().toString())) {
                    isBody = true;
                    elements.add("Body");
                } else if (isBody && "Information".equalsIgnoreCase(startElement.getName().toString())) {
                    isInformation = true;
                    elements.add("Information");
                } else if (isBody && !isInformation
                        && "Details".equalsIgnoreCase(startElement.getName().toString())) {
                    isDetails = true;
                    elements.add("Details");
                } else if (isBody && !isInformation && !isDetails
                        && "Summary".equalsIgnoreCase(startElement.getName().toString())) {
                    isSummary = true;
                    elements.add("Summary");
                } else if (null == elementType) {
                    setElementType(startElement);
                } else if (null != elementType && null == characterType) {
                    setCharacterType(startElement);
                }
            } else if (event.isCharacters()) {
                setValue(event.asCharacters());
            } else if (event.isEndElement()) {
                EndElement endElement = event.asEndElement();
                if (null != characterType && null != elementType) {
                    removeCharacterType();
                } else if (null != elementType) {
                    removeElementType(endElement);
                } else if (isSummary && "Summary".equalsIgnoreCase(endElement.getName().toString())) {
                    isSummary = false;
                } else if (isDetails && "Details".equalsIgnoreCase(endElement.getName().toString())) {
                    isDetails = false;
                } else if (isBody && "Information".equalsIgnoreCase(endElement.getName().toString())) {
                    isInformation = false;
                } else if ("Body".equalsIgnoreCase(endElement.getName().toString())) {
                    isBody = false;
                } else if ("Header".equalsIgnoreCase(endElement.getName().toString())) {
                    isHeader = false;
                }
            }
        }
        this.company = Company.ECU_LINE;
        status = new EdiInvoiceDAO().getStatus(vendorNumber, invoiceNumber);
        if (!elements.contains("Header")) {
            throw new AccountingException("Bad File. <Header> element missing");
        } else if (!elements.contains("Body")) {
            throw new AccountingException("Bad File. <Body> missing");
        } else if (!elements.contains("Information")) {
            throw new AccountingException("Bad File. <Information> element under <Body> missing");
        } else if (!elements.contains("Details")) {
            throw new AccountingException("Bad File. <Details> element under <Body> missing");
        } else if (!elements.contains("Summary")) {
            throw new AccountingException("Bad File. <Summary> element under <Body> missing");
        } else if (!elements.contains("Applicationreference")) {
            throw new AccountingException("Bad File. <Applicationreference> element under <Header> missing");
        } else if (!elements.contains("Reference")) {
            throw new AccountingException("Bad File. <Reference> element under <Header> missing");
        } else if (!elements.contains("Sender")) {
            throw new AccountingException("Bad File. <Sender> element under <Header> missing");
        } else if (!elements.contains("Code")) {
            throw new AccountingException("Bad File. <Code> element under <Sender> of <Header> missing");
        } else if (!elements.contains("Invoice")) {
            throw new AccountingException(
                    "Bad File. <Invoice> element under <Information> element of <Body> missing");
        } else if (!elements.contains("RelatedReferences")) {
            throw new AccountingException(
                    "Bad File. <RelatedReferences> element under <Information> element of <Body> missing");
        } else if (!elements.contains("BY")) {
            throw new AccountingException(
                    "Bad File. <Parties Qualifier=\"BY\"> under <Information> element of <Body> missing");
        } else if (!elements.contains("SU")) {
            throw new AccountingException(
                    "Bad File. <Parties Qualifier=\"SU\"> under <Information> element of <Body> missing");
        } else if (!elements.contains("PaymentTerms")) {
            throw new AccountingException(
                    "Bad File. <PaymentTerms> element under <Information> element of <Body> missing");
        } else if (!elements.contains("ShipmentInformation")) {
            throw new AccountingException(
                    "Bad File. <ShipmentInformation> element under <Information> element of <Body> missing");
        } else if (!elements.contains("Detail")) {
            throw new AccountingException(
                    "Bad File. <Detail> element under <Details> element of <Body> missing");
        } else if (!elements.contains("TotalMonetaryAmount")) {
            throw new AccountingException(
                    "Bad File. <TotalMonetaryAmount> element under <Summary> element of <Body> missing");
        } else if (!elements.contains("TotalMonetaryAmountGroupByVAT")) {
            throw new AccountingException(
                    "Bad File. <TotalMonetaryAmountGroupByVAT> element under <Summary> element of <Body> missing");
        }
    } catch (Exception e) {
        throw e;
    } finally {
        if (null != eventReader) {
            eventReader.close();
        }
        if (null != inputStream) {
            inputStream.close();
        }
    }
}