Example usage of javax.xml.stream.XMLInputFactory.setProperty

Introduction

This page collects example usages of the javax.xml.stream.XMLInputFactory.setProperty method.

Prototype

public abstract void setProperty(java.lang.String name, Object value) throws java.lang.IllegalArgumentException;

Source Link

Document

Allows the user to set a specific feature/property on the underlying implementation.

Usage

From source file:edu.unc.lib.dl.util.TripleStoreQueryServiceMulgaraImpl.java

/**
 * Sends an ITQL command through {@code sendTQL} and extracts the text of every
 * {@code message} element found in the XML response.
 *
 * @param query
 *            an ITQL command
 * @return the message returned by Mulgara, i.e. the concatenated character content of
 *         all {@code message} elements in the response (an empty string if there are
 *         none); {@code null} if {@code sendTQL} returned {@code null} or the response
 *         could not be parsed as XML
 */
public String storeCommand(String query) {
    String response = this.sendTQL(query);
    if (response == null) {
        return null;
    }

    XMLInputFactory factory = XMLInputFactory.newInstance();
    // Coalescing makes the parser merge adjacent character data into a single event.
    factory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);

    String result = null;
    try (StringReader sr = new StringReader(response)) {
        StringBuilder message = new StringBuilder();
        XMLEventReader r = factory.createXMLEventReader(sr);
        try {
            boolean inMessage = false;
            while (r.hasNext()) {
                XMLEvent e = r.nextEvent();
                if (e.isStartElement()) {
                    StartElement s = e.asStartElement();
                    if ("message".equals(s.getName().getLocalPart())) {
                        inMessage = true;
                    }
                } else if (e.isEndElement()) {
                    EndElement end = e.asEndElement();
                    if ("message".equals(end.getName().getLocalPart())) {
                        inMessage = false;
                    }
                } else if (inMessage && e.isCharacters()) {
                    message.append(e.asCharacters().getData());
                }
            }
        } finally {
            // XMLEventReader is not AutoCloseable, so release it explicitly on every
            // path, including when parsing fails part-way through.
            r.close();
        }
        // Only publish the text once the reader has been read and closed cleanly.
        result = message.toString();
    } catch (XMLStreamException e) {
        // Best effort: a malformed response yields null rather than an exception.
        // No logger is visible from this method, so the trace goes to stderr.
        e.printStackTrace();
    }
    return result;
}

From source file:com.ikanow.infinit.e.harvest.extraction.document.file.FileHarvester.java

/**
 * Turns one harvested file into zero or more {@code DocumentPojo}s and appends them to
 * {@code files}.
 *
 * <p>The file "type" is the text after the last '.' of the file name, unless
 * {@code source.getFileConfig().type} is set, in which case that value is used instead.
 * <ul>
 * <li>"xml" / "json" (case-insensitive), or any type ending in "sv": the file is parsed by
 *     {@code XmlToMetadataParser}, {@code JsonToMetadataParser} or
 *     {@code CsvToMetadataParser} respectively, and every document the parser returns is
 *     stamped with source/date/URL information and added to {@code files}.</li>
 * <li>anything else: the file is handed to Tika and a single document holding the
 *     extracted text and metadata is added to {@code files}.</li>
 * </ul>
 *
 * <p>Exceptions raised inside either branch's {@code try} block are not rethrown: they
 * increment {@code errors} and are reported through {@code _context.getHarvestStatus()}.
 * The method also updates {@code _memUsage} / {@code _totalMemUsage}, and in the
 * XML/JSON/line-oriented branch may add to {@code docsToRemove} and
 * {@code sourceUrlsGettingUpdated}.
 *
 * @param f      the file to parse
 * @param source the source configuration the file belongs to
 * @throws MalformedURLException declared by the signature; no statement visible in this
 *                               method throws it directly
 * @throws URISyntaxException    declared by the signature; no statement visible in this
 *                               method throws it directly
 */
private void parse(InfiniteFile f, SourcePojo source) throws MalformedURLException, URISyntaxException {

    //NOTE: we only ever break out of here because of max docs in standalone mode
    // (because we don't know how to continue reading)

    DocumentPojo doc = null;
    //Determine File Extension
    String fileName = f.getName().toString();

    // lastIndexOf returns -1 when there is no '.', in which case the whole name is
    // treated as the extension (substring starts at 0)
    int mid = fileName.lastIndexOf(".");
    String extension = fileName.substring(mid + 1, fileName.length());

    //Checked to save processing time
    long fileTimestamp = (f.getDate() / 1000) * 1000;
    // (ensure truncated to seconds, since some operation somewhere here does this...)

    Date modDate = new Date(fileTimestamp);
    //XML Data gets placed into MetaData

    boolean bIsXml = false;
    boolean bIsJson = false;
    boolean bIsLineOriented = false;
    // An explicitly configured type takes precedence over the file name's extension
    if ((null != source.getFileConfig()) && (null != source.getFileConfig().type)) {
        extension = source.getFileConfig().type;
    }
    bIsXml = extension.equalsIgnoreCase("xml");
    bIsJson = extension.equalsIgnoreCase("json");
    // NOTE(review): unlike the two checks above this one is case-sensitive, so e.g. "CSV"
    // would fall through to the Tika branch - confirm that is intended
    bIsLineOriented = extension.endsWith("sv");

    if (bIsXml || bIsJson || bIsLineOriented) {
        int debugMaxDocs = Integer.MAX_VALUE; // by default don't set this, it's only for debug mode
        if (_context.isStandalone()) { // debug mode
            debugMaxDocs = maxDocsPerCycle;
        }
        //fast check to see if the file has changed before processing (or if it never existed)
        if (needsUpdated_SourceUrl(modDate, f.getUrlString(), source)) {
            // NOTE(review): modDate is a local built from the file timestamp; presumably
            // needsUpdated_SourceUrl resets it to 0 to signal "never seen" - confirm
            if (0 != modDate.getTime()) { // if it ==0 then sourceUrl doesn't exist at all, no need to delete
                // This file already exists - in normal/managed mode will re-create
                // In streaming mode, simply skip over
                if (_streaming) {
                    return;
                } //TESTED

                // Queue a placeholder document, identified only by its source URL, for removal
                DocumentPojo docRepresentingSrcUrl = new DocumentPojo();
                docRepresentingSrcUrl.setSourceUrl(f.getUrlString());
                docRepresentingSrcUrl.setSourceKey(source.getKey());
                // Only the first community id returned by the iterator is used here
                docRepresentingSrcUrl.setCommunityId(source.getCommunityIds().iterator().next());
                sourceUrlsGettingUpdated.add(docRepresentingSrcUrl.getSourceUrl());
                this.docsToRemove.add(docRepresentingSrcUrl);
                // (can add documents with just source URL, are treated differently in the core libraries)
            }

            // The XML and JSON parsers read their settings from the file config, so fall back
            // to an empty config for them; the line-oriented branch never dereferences it
            SourceFileConfigPojo fileSystem = source.getFileConfig();
            if ((null == fileSystem) && (bIsXml || bIsJson)) {
                fileSystem = new SourceFileConfigPojo();
            }
            XmlToMetadataParser xmlParser = null;
            JsonToMetadataParser jsonParser = null;
            String urlType = extension;
            if (bIsXml) {
                xmlParser = new XmlToMetadataParser(fileSystem.XmlRootLevelValues, fileSystem.XmlIgnoreValues,
                        fileSystem.XmlSourceName, fileSystem.XmlPrimaryKey, fileSystem.XmlAttributePrefix,
                        fileSystem.XmlPreserveCase, debugMaxDocs);
            } //TESTED
            else if (bIsJson) {
                // The JSON parser is configured from the same Xml* config fields
                jsonParser = new JsonToMetadataParser(fileSystem.XmlSourceName, fileSystem.XmlRootLevelValues,
                        fileSystem.XmlPrimaryKey, fileSystem.XmlIgnoreValues, debugMaxDocs);
            } //TESTED

            List<DocumentPojo> partials = null;
            try {
                if (bIsXml) {
                    XMLStreamReader xmlStreamReader = null;
                    XMLInputFactory factory = XMLInputFactory.newInstance();
                    // Merge adjacent character data into single events, and disable DTD support
                    factory.setProperty(XMLInputFactory.IS_COALESCING, true);
                    factory.setProperty(XMLInputFactory.SUPPORT_DTD, false);
                    try {
                        xmlStreamReader = factory.createXMLStreamReader(f.getInputStream());
                        partials = xmlParser.parseDocument(xmlStreamReader);
                        long memUsage = xmlParser.getMemUsage();
                        _memUsage += memUsage;
                        _totalMemUsage.addAndGet(memUsage);
                    } finally {
                        if (null != xmlStreamReader)
                            xmlStreamReader.close();
                    }
                } //TESTED
                else if (bIsJson) {
                    JsonReader jsonReader = null;
                    try {
                        jsonReader = new JsonReader(new InputStreamReader(f.getInputStream(), "UTF-8"));
                        jsonReader.setLenient(true);
                        partials = jsonParser.parseDocument(jsonReader);
                        long memUsage = jsonParser.getMemUsage();
                        _memUsage += memUsage;
                        _totalMemUsage.addAndGet(memUsage);
                    } finally {
                        if (null != jsonReader)
                            jsonReader.close();
                    }
                } //TESTED
                else if (bIsLineOriented) { // Just generate a document for every line

                    BufferedReader lineReader = null;
                    try {
                        lineReader = new BufferedReader(new InputStreamReader(f.getInputStream(), "UTF-8"));
                        CsvToMetadataParser lineParser = new CsvToMetadataParser(debugMaxDocs);
                        partials = lineParser.parseDocument(lineReader, source);
                        long memUsage = lineParser.getMemUsage();
                        _memUsage += memUsage;
                        _totalMemUsage.addAndGet(memUsage);
                    } finally {
                        if (null != lineReader)
                            lineReader.close();
                    }
                } //TESTED

                // NOTE(review): md5 is only ever null-checked below; the actual hashing is done
                // by DigestUtils.md5Hex, so this lookup just selects which URL scheme is used
                MessageDigest md5 = null; // (generates unique urls if the user doesn't below)
                try {
                    md5 = MessageDigest.getInstance("MD5");
                } catch (NoSuchAlgorithmException e) {
                    // Do nothing, unlikely to happen...
                }
                int nIndex = 0;
                // NOTE(review): if a parser returned null this throws NullPointerException, which
                // is then counted and logged by the catch (Exception) handler below
                int numPartials = partials.size();
                for (DocumentPojo doctoAdd : partials) {
                    nIndex++; // 1-based position of this document within the file
                    doctoAdd.setSource(source.getTitle());
                    doctoAdd.setSourceKey(source.getKey());
                    doctoAdd.setMediaType(source.getMediaType());
                    doctoAdd.setModified(new Date(fileTimestamp));
                    doctoAdd.setCreated(new Date());

                    if (null == doctoAdd.getUrl()) { // Can be set in the parser or here
                        doctoAdd.setHasDefaultUrl(true); // (ie cannot occur in a different src URL)

                        if (1 == numPartials) {
                            // Single document: use the file URL, appending ".<type>" if not already there
                            String urlString = f.getUrlString();
                            if (urlString.endsWith(urlType)) {
                                doctoAdd.setUrl(urlString);
                            } else {
                                doctoAdd.setUrl(
                                        new StringBuffer(urlString).append('.').append(urlType).toString());
                            }
                            // (we always set sourceUrl as the true url of the file, so want to differentiate the URL with
                            //  some useful information)
                        } else if (null == doctoAdd.getMetadata()) { // Line oriented case
                            // <file url>/<index>.<type>
                            doctoAdd.setUrl(new StringBuffer(f.getUrlString()).append("/").append(nIndex)
                                    .append('.').append(urlType).toString());
                        } else {
                            if (null == md5) { // Will never happen, MD5 always exists
                                // <file url>/<metadata hashCode>.<type>
                                doctoAdd.setUrl(new StringBuffer(f.getUrlString()).append("/")
                                        .append(doctoAdd.getMetadata().hashCode()).append('.').append(urlType)
                                        .toString());
                            } else { // This is the standard call if the XML parser has not been configured to build the URL
                                // <file url>/<md5 hex of metadata.toString()>.<type>
                                doctoAdd.setUrl(new StringBuffer(f.getUrlString()).append("/")
                                        .append(DigestUtils.md5Hex(doctoAdd.getMetadata().toString()))
                                        .append('.').append(urlType).toString());
                            }
                        } //TESTED
                    }
                    doctoAdd.setTitle(f.getName().toString());
                    doctoAdd.setPublishedDate(new Date(fileTimestamp));
                    doctoAdd.setSourceUrl(f.getUrlString());

                    // Always add to files because I'm deleting the source URL
                    files.add(doctoAdd);
                } //TESTED

            // Every failure below is counted and logged; nothing is rethrown
            } catch (XMLStreamException e1) {
                errors++;
                _context.getHarvestStatus()
                        .logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(), true);
            } catch (FactoryConfigurationError e1) {
                errors++;
                _context.getHarvestStatus().logMessage(e1.getMessage(), true);

            } catch (IOException e1) {
                errors++;
                _context.getHarvestStatus()
                        .logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(), true);
            } catch (Exception e1) {
                errors++;
                _context.getHarvestStatus()
                        .logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(), true);
            }
        } //(end if needs updated)
    } else //Tika supports Excel,Word,Powerpoint,Visio, & Outlook Documents
    {
        // (This dedup tells me if it's an add/update vs ignore - qr.isDuplicate higher up tells me if I need to add or update)
        if (needsUpdated_Url(modDate, f.getUrlString(), source)) {

            Metadata metadata = null;
            InputStream in = null;
            try {

                doc = new DocumentPojo();

                // Create a tika object (first time only)
                if (null == _tika) {
                    this.initializeTika(_context, source);
                }

                // Known limitation: there is no cap on the extracted text size.
                // NEED TO LIKELY SET LIMIT TO BE 30MB or 50MB and BYPASS ANYTHING OVER THAT BELOW IS THE CODE TO DO THAT
                // tika.setMaxStringLength(30*1024*1024);
                // Disable the string length limit
                _tika.setMaxStringLength(-1);
                //input = new FileInputStream(new File(resourceLocation));
                // Create a metadata object to contain the metadata

                metadata = new Metadata();
                // Parse the file and get the text of the file
                doc.setSource(source.getTitle());
                doc.setSourceKey(source.getKey());
                doc.setMediaType(source.getMediaType());
                String fullText = "";

                in = f.getInputStream();
                try {
                    if (null == _tikaOutputFormat) { // text only
                        fullText = _tika.parseToString(in, metadata);
                    } //TESTED
                    else { // XML/HTML
                        _tika.getParser().parse(in, _tikaOutputFormat, metadata, _tikaOutputParseContext);
                        // Take the text accumulated in the shared writer, then reset its buffer
                        fullText = _tikaXmlFormatWriter.toString();
                        _tikaXmlFormatWriter.getBuffer().setLength(0);
                    } //TESTED
                } finally {
                    if (null != in)
                        in.close();
                }
                // Description is the first 500 characters of the text (or all of it, if shorter)
                int descCap = 500;
                doc.setFullText(fullText);
                if (descCap > fullText.length()) {
                    descCap = fullText.length();
                }
                doc.setDescription(fullText.substring(0, descCap));
                doc.setModified(new Date(fileTimestamp));
                doc.setCreated(new Date());
                doc.setUrl(f.getUrlString());
                doc.setTitle(f.getName().toString());
                doc.setPublishedDate(new Date(fileTimestamp));

                long memUsage = (250L * (doc.getFullText().length() + doc.getDescription().length())) / 100L; // 25% overhead, 2x for string->byte
                _memUsage += memUsage;
                _totalMemUsage.addAndGet(memUsage);

                // If the metadata contains a title then use that instead of the file name
                try {
                    String title = metadata.get(Metadata.TITLE);
                    if (null != title) {
                        doc.setTitle(title);
                    }
                } catch (Exception e) { // Fine just carry on
                }
                // If the metadata contains a more plausible date then use that
                // (tried in order: CREATION_DATE, DATE, ORIGINAL_DATE)
                try {
                    Date date = metadata.getDate(Metadata.CREATION_DATE); // MS Word
                    if (null != date) {
                        doc.setPublishedDate(date);
                    } else {
                        date = metadata.getDate(Metadata.DATE); // Dublin
                        if (null != date) {
                            doc.setPublishedDate(date);
                        } else {
                            date = metadata.getDate(Metadata.ORIGINAL_DATE);
                            if (null != date) {
                                doc.setPublishedDate(date);
                            }
                        }
                    }
                } catch (Exception e) { // Fine just carry on
                }
                //TESTED

                // If the metadata contains a geotag then apply that:
                // (an unparseable latitude/longitude is silently ignored by the catch below)
                try {
                    String lat = metadata.get(Metadata.LATITUDE);
                    String lon = metadata.get(Metadata.LONGITUDE);
                    if ((null != lat) && (null != lon)) {
                        GeoPojo gt = new GeoPojo();
                        gt.lat = Double.parseDouble(lat);
                        gt.lon = Double.parseDouble(lon);
                        doc.setDocGeo(gt);
                    }
                } catch (Exception e) { // Fine just carry on
                }

                // Save the entire metadata:
                doc.addToMetadata("_FILE_METADATA_", metadata);

                // NOTE(review): setCommunityId is called once per id, so if it is a plain setter
                // only the last community id iterated is retained - confirm
                for (ObjectId communityId : source.getCommunityIds()) {
                    doc.setCommunityId(communityId);
                }
                files.add(doc);

                // Close the input stream
                // (it was already closed by the inner finally above, so this is a second close)
                in.close();
                in = null;

                //TESTED

            // Every failure below is counted and logged; nothing is rethrown
            } catch (SmbException e) {
                errors++;
                _context.getHarvestStatus()
                        .logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
            } catch (MalformedURLException e) {
                errors++;
                _context.getHarvestStatus()
                        .logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
            } catch (UnknownHostException e) {
                errors++;
                _context.getHarvestStatus()
                        .logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
            } catch (IOException e) {
                errors++;
                _context.getHarvestStatus().logMessage(e.getMessage(), true);
            } catch (TikaException e) {
                errors++;
                _context.getHarvestStatus().logMessage(e.getMessage(), true);
            } catch (Exception e) {
                errors++;
                _context.getHarvestStatus()
                        .logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
            } finally { // Close the input stream if an error occurs
                if (null != in) {
                    try {
                        in.close();
                    } catch (IOException e) {
                        // All good, do nothing
                    }
                }
            } // end exception handling
        } // end dedup check
    } // end XML vs "office" app

    //DEBUG
    //System.out.println("FILE=" + files.size() + " / MEM=" + _memUsage + " VS " + Runtime.getRuntime().totalMemory());
}

From source file:com.clustercontrol.agent.winevent.WinEventMonitor.java

/**
 * XMLStAX???EventLogRecord????/*  w w  w.  j a v  a2s .co m*/
 * @param eventXmlStream
 * @return EventLogRecord?
 */
private ArrayList<EventLogRecord> parseEventXML(InputStream eventXmlStream) {
    ArrayList<EventLogRecord> eventlogs = new ArrayList<EventLogRecord>();

    try {
        XMLInputFactory xmlif = XMLInputFactory.newInstance();
        /**
         * OpenJDK7/OracleJDK7??"]"?2?????????????????????????????
         * ?XML?????????OpenJDK7/OracleJDK7???????/??????????
         * URL???????????????
         * 
         * URL
         * http://docs.oracle.com/javase/jp/6/api/javax/xml/stream/XMLStreamReader.html#next()
         */
        String xmlCoalescingKey = "javax.xml.stream.isCoalescing";// TODO JRE???????????????????
        if (m_log.isDebugEnabled()) {
            m_log.debug(xmlCoalescingKey + " = true");
        }
        xmlif.setProperty(xmlCoalescingKey, true);
        XMLStreamReader xmlr = xmlif.createXMLStreamReader(eventXmlStream);

        while (xmlr.hasNext()) {
            switch (xmlr.getEventType()) {
            case XMLStreamConstants.START_ELEMENT:
                m_log.trace("EventType : XMLStreamConstants.START_ELEMENT");

                String localName = xmlr.getLocalName();
                m_log.trace("local name : " + localName);

                if ("Event".equals(localName)) {
                    EventLogRecord eventlog = new EventLogRecord();
                    eventlogs.add(eventlog);
                    m_log.debug("create new EventLogRecord");
                } else {
                    String attrLocalName = null;
                    String attrValue = null;

                    if (xmlr.getAttributeCount() != 0) {
                        attrLocalName = xmlr.getAttributeLocalName(0);
                        attrValue = xmlr.getAttributeValue(0);
                        m_log.trace("attribute local name : " + attrLocalName);
                        m_log.trace("attribute local value : " + attrValue);
                    }

                    if ("Provider".equals(localName)) {
                        if ("Name".equals(attrLocalName)) {
                            m_log.trace("target value : " + attrValue);

                            EventLogRecord eventlog = eventlogs.get(eventlogs.size() - 1);
                            eventlog.setProviderName(attrValue);
                            m_log.debug("set ProviderName : " + eventlog.getProviderName());
                        }
                    }
                    // Get-WinEvent/wevtutil.exe
                    else if ("TimeCreated".equals(localName) && "SystemTime".equals(attrLocalName)) {
                        m_log.trace("target value : " + attrValue);

                        // "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS'Z'"???S????????????
                        String formatedDateString = attrValue.replaceAll("\\..*Z", "");
                        m_log.trace("formatted target value : " + formatedDateString);
                        DateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");
                        sdf.setTimeZone(TimeZone.getTimeZone("UTC"));

                        EventLogRecord eventlog = eventlogs.get(eventlogs.size() - 1);
                        ;
                        try {
                            eventlog.setTimeCreated(sdf.parse(formatedDateString));
                        } catch (ParseException e) {
                            // do nothing
                            m_log.error("set TimeCreated Error", e);
                        }
                        m_log.debug("set TimeCreated : " + eventlog.getTimeCreated());
                    }
                    // Get-EventLog
                    if ("TimeGenerated".equals(localName) && "SystemTime".equals(attrLocalName)) {
                        m_log.trace("target value : " + attrValue);
                        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss'Z'");
                        sdf.setTimeZone(HinemosTime.getTimeZone());

                        EventLogRecord eventlog = eventlogs.get(eventlogs.size() - 1);
                        ;
                        try {
                            eventlog.setTimeCreated(sdf.parse(attrValue));
                        } catch (ParseException e) {
                            // do nothing
                            m_log.error("set TimeCreated Error", e);
                        }
                        m_log.debug("set TimeCreated : " + eventlog.getTimeCreated());
                    } else {
                        targetProperty = localName;
                        m_log.trace("target property : " + targetProperty);
                    }
                }

                break;
            case XMLStreamConstants.SPACE:
            case XMLStreamConstants.CHARACTERS:
                m_log.trace("EventType : XMLStreamConstants.CHARACTERS, length=" + xmlr.getTextLength());
                if (targetProperty != null) {
                    try {
                        EventLogRecord eventlog = eventlogs.get(eventlogs.size() - 1);
                        ;
                        if ("EventID".equals(targetProperty)) {
                            eventlog.setId(Integer.parseInt(new String(xmlr.getTextCharacters(),
                                    xmlr.getTextStart(), xmlr.getTextLength())));
                            m_log.debug("set EventID : " + eventlog.getId());
                        }
                        // Get-WinEvent/wevtutil.exe
                        else if ("Level".equals(targetProperty)) {
                            if (eventlog.getLevel() == WinEventConstant.UNDEFINED) {
                                eventlog.setLevel(Integer.parseInt(new String(xmlr.getTextCharacters(),
                                        xmlr.getTextStart(), xmlr.getTextLength())));
                                m_log.debug("set Level : " + eventlog.getLevel());
                            }
                        } else if ("Task".equals(targetProperty)) {
                            if (eventlog.getTask() == WinEventConstant.UNDEFINED) {
                                eventlog.setTask(Integer.parseInt(new String(xmlr.getTextCharacters(),
                                        xmlr.getTextStart(), xmlr.getTextLength())));
                                m_log.debug("set Task : " + eventlog.getTask());
                            }
                        } else if ("Keywords".equals(targetProperty)) {
                            // TODO ????????0x8080000000000000
                            //eventlog.setKeywords(Long.decode(new String(xmlr.getTextCharacters(), xmlr.getTextStart(), xmlr.getTextLength())));
                            //m_log.debug("set Keywords : " + eventlog.getKeywords());
                        } else if ("EventRecordId".equals(targetProperty)) {
                            eventlog.setRecordId(Long.parseLong(new String(xmlr.getTextCharacters(),
                                    xmlr.getTextStart(), xmlr.getTextLength())));
                            m_log.debug("set RecordId : " + eventlog.getRecordId());
                        } else if ("Channel".equals(targetProperty)) {
                            eventlog.setLogName(new String(xmlr.getTextCharacters(), xmlr.getTextStart(),
                                    xmlr.getTextLength()));
                            m_log.debug("set LogName : " + eventlog.getLogName());
                        } else if ("Computer".equals(targetProperty)) {
                            eventlog.setMachineName(new String(xmlr.getTextCharacters(), xmlr.getTextStart(),
                                    xmlr.getTextLength()));
                            m_log.debug("set MachineName : " + eventlog.getMachineName());
                        } else if ("Message".equals(targetProperty)) {
                            String message = new String(xmlr.getTextCharacters(), xmlr.getTextStart(),
                                    xmlr.getTextLength());
                            message = message.replaceAll(tmpReturnCode, "\r\n");
                            message = message.replaceAll(tmpLtCode, "<");
                            message = message.replaceAll(tmpGtCode, ">");
                            eventlog.setMessage(message);
                            m_log.debug("set Message : " + eventlog.getMessage());
                        } else if ("Data".equals(targetProperty)) {
                            String data = new String(xmlr.getTextCharacters(), xmlr.getTextStart(),
                                    xmlr.getTextLength());
                            eventlog.getData().add(data);
                            m_log.debug("set Data : " + data);
                        } else {
                            m_log.debug("unknown target property : " + targetProperty);
                        }
                    } catch (NumberFormatException e) {
                        m_log.debug("number parse error", e);
                    }
                }
                targetProperty = null;
                break;
            default: // 
                break;
            }
            xmlr.next();
        }
        xmlr.close();
    } catch (XMLStreamException e) {
        m_log.warn("parseEvent() xmlstream error", e);
    }

    return eventlogs;

}

From source file:edu.harvard.iq.safe.lockss.impl.LOCKSSDaemonStatusTableXmlStreamParser.java

/**
 * Parses a LOCKSS daemon status table from an XML stream with a StAX cursor
 * reader and stores the outcome in this parser's fields (table id/key/title,
 * column descriptors, row data as both list and map, summary-info entries,
 * row/cell counters and the incomplete-row flag).
 *
 * <p>Parsing stops early when an {@code <error>} element is met or when the
 * table name is not in {@code belongsInclusionTableList}. Any exception raised
 * while parsing is logged and recorded by setting {@code isTargetPageValid} to
 * {@code false}; it is not propagated to the caller. The post-processing that
 * follows the parse tolerates a table id that was never read.
 *
 * @param stream   the XML document to parse; it is wrapped in a
 *                 {@link BufferedInputStream} before being handed to the reader
 * @param encoding the character encoding passed to the stream reader
 */
@Override
public void read(InputStream stream, String encoding) {
    // logger.setLevel(Level.FINE);
    // 1. create Input factory
    XMLInputFactory xmlif = XMLInputFactory.newInstance();
    xmlif.setProperty("javax.xml.stream.isCoalescing", java.lang.Boolean.TRUE);
    xmlif.setProperty("javax.xml.stream.isNamespaceAware", java.lang.Boolean.TRUE);
    // The document comes from a remote daemon: never resolve external
    // entities (XXE hardening).
    xmlif.setProperty("javax.xml.stream.isSupportingExternalEntities", java.lang.Boolean.FALSE);

    long startTime = System.currentTimeMillis();

    String currentTableId = null;
    String currentTableTitle = null;
    String currentTableKey = null;
    boolean hasErrorsColumn = false;
    String siAuId = null;
    XMLStreamReader xmlr = null;

    try {

        // create reader
        xmlr = xmlif.createXMLStreamReader(new BufferedInputStream(stream), encoding);

        String curElement = "";

        boolean isLastTagnameTable = false;
        boolean withinSummaryinfo = false;
        boolean withinColumndescriptor = false;
        boolean withinRow = false;
        boolean withinCell = false;
        boolean withinReference = false;
        boolean isCrawlStatusActive = false;
        boolean isCrawlStatusColumn = false;
        int valueTagCounter = 0;
        String currentColumnName = null;
        String currentCellValue = null;
        String currentCellKey = null;
        SummaryInfo si = null;

        List<String> rowData = null;
        Map<String, String> rowDataH = null;

        w1: while (xmlr.hasNext()) {
            int eventType = xmlr.next();
            switch (eventType) {
            case XMLStreamConstants.START_ELEMENT:
                curElement = xmlr.getLocalName(); // note: getName() ->
                // QName
                logger.log(Level.FINE, "--------- start tag = <{0}> ---------", curElement);
                // check the table name first
                if (curElement.equals("table")) {
                    isLastTagnameTable = true;
                } else if (curElement.equals("error")) {
                    isTargetPageValid = false;
                    break w1;
                }

                if (isLastTagnameTable) {
                    if (curElement.equals("name")) {
                        currentTableId = xmlr.getElementText();
                        logger.log(Level.FINE, "########### table Id = [{0}] ###########", currentTableId);
                        tableId = currentTableId;
                        if (belongsInclusionTableList.contains(currentTableId)) {
                            logger.log(Level.FINE, "!!!!! Table ({0}) belongs to the target list !!!!!",
                                    currentTableId);

                        } else {
                            logger.log(Level.FINE,
                                    "XXXXXXXXXXX Table ({0}) does not belong to the target list XXXXXXXXXXX",
                                    currentTableId);
                            break w1;
                        }
                    } else if (curElement.equals("key")) {
                        currentTableKey = xmlr.getElementText();
                        logger.log(Level.FINE, "---------- table key = ({0}) ----------", currentTableKey);
                        tableKey = currentTableKey;
                    } else if (curElement.equals("title")) {
                        currentTableTitle = xmlr.getElementText();
                        logger.log(Level.FINE, "+++++++++ table Title = ({0}) +++++++++", currentTableTitle);
                        // null-safe: the title may arrive before the table name was read
                        if ("PeerRepair".equals(currentTableId)) {
                            if (currentTableTitle.startsWith("Repair candidates for AU: ")) {
                                currentTableTitle = currentTableTitle.replaceFirst("Repair candidates for AU: ",
                                        "");
                                logger.log(Level.FINE, "save this modified table-Title as auName={0}",
                                        currentTableTitle);
                                this.tableTitle = currentTableTitle;
                            } else {
                                logger.log(Level.WARNING,
                                        "The table-Title does not start with the expected token={0}",
                                        currentTableTitle);
                            }
                        }
                        isLastTagnameTable = false;
                    }
                }

                if (curElement.equals("columndescriptor")) {
                    withinColumndescriptor = true;
                } else if (curElement.equals("row")) {
                    withinRow = true;
                    rowCounter++;
                    logger.log(Level.FINE, "================== {0}-th row starts here ==================",
                            rowCounter);
                    // set-up fresh storage for every row
                    rowData = new ArrayList<String>();
                    rowDataH = new LinkedHashMap<String, String>();
                } else if (curElement.equals("cell")) {
                    logger.log(Level.FINE, "entering a cell");
                    withinCell = true;
                } else if (curElement.equals("reference")) {
                    withinReference = true;
                    logger.log(Level.FINE, "within reference on");
                } else if (curElement.equals("summaryinfo")) {
                    withinSummaryinfo = true;
                    si = new SummaryInfo();
                } else if (curElement.equals("value")) {
                    logger.log(Level.FINE, "entering a value");
                    valueTagCounter++;
                }
                //---- columndescriptor tag ---------------------------------------------------
                if (withinColumndescriptor) {
                    if (curElement.equals("name")) {

                        String nameText = xmlr.getElementText();
                        logger.log(Level.FINE, "\tcolumndescriptor: name = {0}", nameText);
                        columndescriptorList.add(nameText);
                    } else if (curElement.equals("title")) {
                        String titleText = xmlr.getElementText();
                        logger.log(Level.FINE, "\tcolumndescriptor: title = {0}", titleText);
                    } else if (curElement.equals("type")) {
                        String typeText = xmlr.getElementText();
                        logger.log(Level.FINE, "\tcolumndescriptor: type = {0}", typeText);
                        getTypeList().add(typeText);
                    }
                }
                //---- cell tag ----------------------------------------------------------------
                if (withinCell) {
                    logger.log(Level.FINE, "parsing withinCell");
                    if (curElement.equals("columnname")) {

                        String columnname = xmlr.getElementText();
                        logger.log(Level.FINE, "\t\tcolumnname = {0}", columnname);
                        currentColumnName = columnname;
                        if (columnname.equals("crawl_status")) {
                            isCrawlStatusColumn = true;
                        } else {
                            isCrawlStatusColumn = false;
                        }

                        if (columnname.equals("Errors")) {
                            hasErrorsColumn = true;
                        }

                    } else {
                        // value tag block: either value-tag WO a child element
                        // or with a child element
                        /*
                         * <value><reference>...<value>xxxx</value>
                         * <value>xxxx</value>
                         */
                        if ((curElement.equals("value")) && (!withinReference)) {
                            logger.log(Level.FINE, "entering el:value/WO-REF block");
                            if (!hasReferenceTag.contains(currentColumnName)) {
                                logger.log(Level.FINE, "No child reference tag is expected for this value tag");
                                logger.log(Level.FINEST, "xmlr.getEventType():pre-parsing={0}",
                                        xmlr.getEventType());
                                String cellValue = xmlr.getElementText();
                                // note: the above parsing action moves the
                                // cursor to the end-tag, i.e., </value>
                                // therefore, the end-element-switch-block below
                                // cannot catch this </value> tag

                                logger.log(Level.FINE, "\t\t\t[No ref: value] {0} = {1}",
                                        new Object[] { currentColumnName, cellValue });

                                currentCellValue = cellValue;
                                logger.log(Level.FINEST, "xmlr.getEventType():post-parsing={0}",
                                        xmlr.getEventType());
                                // store this value
                                // rowData
                                logger.log(Level.FINE, "current column name={0}", currentColumnName);
                                logger.log(Level.FINE, "valueTagCounter={0}", valueTagCounter);
                                if (currentColumnName.endsWith("Damaged")) {
                                    if (valueTagCounter <= 1) {
                                        // the 2nd value tag is a footnote for this
                                        // column and is ignored; only the 1st is kept
                                        rowData.add(cellValue);
                                        rowDataH.put(currentColumnName, currentCellValue);
                                    }
                                } else {
                                    rowData.add(cellValue);
                                    rowDataH.put(currentColumnName, currentCellValue);
                                }
                            } else {
                                // previously this block was unthinkable, but
                                // it was found that there are columns that
                                // temporarily have a <reference> tag in
                                // crawl_status_table; these columns are
                                // included in hasReferenceTag by default;
                                // thus, for such unstable columns,
                                // when they have a <reference> tag,
                                // data are captured in another within-
                                // reference block; however, when these
                                // columns no longer have <reference> tag,
                                // text data would be left uncaptured unless
                                // some follow-up processing takes place here
                                logger.log(Level.FINE, "May have to capture data: column={0}",
                                        currentColumnName);
                                if (mayHaveReferenceTag.contains(currentColumnName) && !isCrawlStatusActive) {
                                    // because the crawling is not active,
                                    // it is safely assume that the maybe columns have no reference tag

                                    // 2011-10-24 the above assumption was found wrong
                                    // a crawling cell does not say active but
                                    // subsequent columns have a reference
                                    logger.log(Level.FINE,
                                            "a text or a reference tag : try to parse it as a text");
                                    String cellValue = null;
                                    try {
                                        cellValue = xmlr.getElementText();
                                    } catch (javax.xml.stream.XMLStreamException ex) {
                                        // not plain text after all: resume the
                                        // event loop with the next event
                                        continue;
                                    }
                                    logger.log(Level.FINE, "\t\t\t[value WO-ref(crawling_NOT_active case)={0}]",
                                            currentColumnName + " = " + cellValue);
                                    currentCellValue = cellValue;
                                    // store this value
                                    // rowData
                                    logger.log(Level.FINE, "\t\t\tcurrent columnName={0}", currentColumnName);
                                    rowData.add(cellValue);
                                    rowDataH.put(currentColumnName, currentCellValue);

                                } else {
                                    logger.log(Level.FINE, "WO-Ref: no processing items now:{0}", curElement);
                                }
                            }
                        } else if (withinReference) {
                            // reference tag exists
                            logger.log(Level.FINE, "WR:curElement={0}", curElement);

                            if (curElement.equals("key")) {
                                String cellKey = xmlr.getElementText();
                                logger.log(Level.FINE, "\t\tcurrentCellKey is set to={0}", cellKey);
                                currentCellKey = cellKey;
                            } else if (curElement.equals("value")) {
                                String cellValue = xmlr.getElementText();

                                logger.log(Level.FINE, "\t\twr: {0} = {1}",
                                        new Object[] { currentColumnName, cellValue });

                                // exception cases follow:
                                if (currentColumnName.equals("AuName")) {
                                    logger.log(Level.FINE, "\t\tAuName is replaced with the key[=AuId]= {0}",
                                            currentCellKey);
                                    // This block is for ArchivalUnitStatusTable
                                    // add the key as a new datum (auId)
                                    // ahead of its value
                                    rowData.add(currentCellKey);
                                    rowDataH.put("AuId", currentCellKey);
                                    currentCellValue = cellValue;
                                } else if (currentColumnName.equals("auId")) {
                                    // This block is for V3PollerTable
                                    logger.log(Level.FINE, "\t\tnew value for auId(V3PollerTable)={0}",
                                            currentCellKey);
                                    // add auName as a new column ahead of auId

                                    rowData.add(cellValue);
                                    rowDataH.put("auName", cellValue);
                                    logger.log(Level.FINE, "\t\tauName(V3PollerTable)={0}", cellValue);

                                    currentCellValue = currentCellKey;
                                } else if (currentColumnName.equals("pollId")) {
                                    // this block is for V3PollerTable
                                    logger.log(Level.FINE, "\t\tFull string (key) is used={0}", currentCellKey);
                                    // The key has the complete string whereas
                                    // the value is its truncated copy
                                    currentCellValue = currentCellKey;

                                } else if (currentColumnName.equals("au")) {
                                    logger.log(Level.FINE,
                                            "\t\tauId is used instead for au(crawl_status_table)={0}",
                                            currentCellKey);

                                    // 2012-02-02: add auName ahead of au
                                    rowData.add(cellValue);
                                    rowDataH.put("auName", cellValue);
                                    logger.log(Level.FINE, "\t\tauName={0}", cellValue);

                                    // This block is for crawl_status_table
                                    // save the key(auId) instead of value
                                    currentCellValue = currentCellKey;

                                } else if (currentColumnName.equals("Peers")) {

                                    logger.log(Level.FINE, "\t\tURL (key) is used={0}", currentCellKey);
                                    currentCellValue = DaemonStatusDataUtil.escapeHtml(currentCellKey);
                                    logger.log(Level.FINE, "\t\tAfter encoding ={0}", currentCellValue);

                                } else {
                                    if (isCrawlStatusColumn) {
                                        // if the crawl status column is
                                        // "active", some later columns
                                        // may have a reference tag
                                        // so turn on the switch
                                        if (cellValue.equals("Active") || (cellValue.equals("Pending"))) {
                                            isCrawlStatusActive = true;
                                        } else {
                                            isCrawlStatusActive = false;
                                        }
                                    }
                                    // the default processing
                                    currentCellValue = cellValue;
                                }
                                // store currentCellValue
                                logger.log(Level.FINE, "currentCellValue={0}", currentCellValue);
                                // rowData
                                rowData.add(currentCellValue);
                                rowDataH.put(currentColumnName, currentCellValue);
                            } // Within ref tag: key and value processing
                        } // value with text or value with ref tag
                    } // columnname or value
                } // within cell
                // ---- summaryinfo tag --------------------------------------------------------
                if (withinSummaryinfo) {
                    logger.log(Level.FINE,
                            "============================ Within SummaryInfo ============================ ");
                    if (curElement.equals("title")) {
                        String text = xmlr.getElementText();
                        si.setTitle(text);

                        logger.log(Level.FINE, "\tsi:titile={0}", si.getTitle());
                    } else if (curElement.equals("type")) {
                        String text = xmlr.getElementText();
                        si.setType(Integer.parseInt(text));
                        logger.log(Level.FINE, "\tsi:type={0}", si.getType());
                    } else if (curElement.equals("key")) {
                        if (withinReference && si.getTitle().equals("Volume")) {
                            String text = xmlr.getElementText();
                            logger.log(Level.FINE, "\tsi:key contents(Volume case)={0}", text);
                            siAuId = text;
                            logger.log(Level.FINE, "\tsi:value(Volume case)={0}", siAuId);
                        }
                    } else if (curElement.equals("value")) {
                        if (withinReference) {
                            if (hasRefTitileTagsSI.contains(si.getTitle())) {
                                if (si.getTitle().equals("Volume")) {
                                    // 2012-02-02 use the au name
                                    String text = xmlr.getElementText();
                                    si.setValue(text);
                                    logger.log(Level.FINE, "\tsi:value(Volume case)={0}", si.getValue());
                                } else {
                                    String text = xmlr.getElementText();
                                    si.setValue(text);
                                    logger.log(Level.FINE, "\tsi:value={0}", si.getValue());
                                }
                            }
                        } else {
                            // note: 2012-02-07
                            // daemon 1.59.2 uses the new layout for AU page
                            // this layout includes a summaryinfo tag
                            // that now contains a reference tag
                            String text = null;

                            try {
                                text = xmlr.getElementText();
                                if (!hasRefTitileTagsSI.contains(si.getTitle())) {
                                    si.setValue(text);
                                    logger.log(Level.FINE, "\tsi:value={0}", si.getValue());
                                }
                            } catch (javax.xml.stream.XMLStreamException ex) {
                                logger.log(Level.WARNING, "encounter a reference tag rather than text");
                                // resume the event loop with the next event
                                continue;
                            }
                        }
                    }
                }

                break;
            case XMLStreamConstants.CHARACTERS:
                break;

            case XMLStreamConstants.ATTRIBUTE:
                break;

            case XMLStreamConstants.END_ELEMENT:
                if (xmlr.getLocalName().equals("columndescriptor")) {
                    withinColumndescriptor = false;
                    logger.log(Level.FINE, "leaving columndescriptor");
                } else if (xmlr.getLocalName().equals("row")) {
                    if (withinRow) {
                        logger.log(Level.FINE, "========= end of the target row element");
                        withinRow = false;
                    }
                    if (!isCrawlStatusActive) {
                        tabularData.add(rowData);
                        tableData.add(rowDataH);

                    } else {
                        // rows whose crawl status is active/pending are
                        // dropped and do not count as rows
                        rowIgnored++;
                        rowCounter--;
                    }
                    rowData = null;
                    rowDataH = null;
                    isCrawlStatusActive = false;
                } else if (xmlr.getLocalName().equals("cell")) {
                    cellCounter++;
                    withinCell = false;
                    currentColumnName = null;
                    currentCellValue = null;
                    currentCellKey = null;
                    isCrawlStatusColumn = false;
                    valueTagCounter = 0;
                    logger.log(Level.FINE, "leaving cell");
                } else if (xmlr.getLocalName().equals("columnname")) {
                    logger.log(Level.FINE, "leaving columnname");
                } else if (xmlr.getLocalName().equals("reference")) {
                    withinReference = false;
                } else if (xmlr.getLocalName().equals("summaryinfo")) {
                    logger.log(Level.FINE, "si={0}", si.toString());
                    summaryInfoList.add(si);
                    si = null;
                    withinSummaryinfo = false;
                } else if (xmlr.getLocalName().equals("value")) {
                    logger.log(Level.FINE, "leaving value");
                } else {
                    // report the tag actually being closed, not the last
                    // start tag that happened to be seen
                    logger.log(Level.FINE, "--------- end tag = <{0}> ---------", xmlr.getLocalName());
                }

                break;
            case XMLStreamConstants.END_DOCUMENT:
                logger.log(Level.FINE, "Total of {0} row occurrences", rowCounter);
            } // end: switch
        } // end:while
    } catch (XMLStreamException ex) {
        logger.log(Level.WARNING, "XMLStreamException occurs", ex);
        this.isTargetPageValid = false;

    } catch (RuntimeException re) {
        logger.log(Level.WARNING, "some RuntimeException occurs", re);
        this.isTargetPageValid = false;
    } catch (Exception e) {
        logger.log(Level.WARNING, "some Exception occurs", e);
        this.isTargetPageValid = false;
    } finally {
        // 5. close reader/IO
        if (xmlr != null) {
            try {
                xmlr.close();
            } catch (XMLStreamException ex) {
                logger.log(Level.WARNING, "XMLStreamException occurs during close()", ex);
            }
        }
        if (!this.isTargetPageValid) {
            logger.log(Level.WARNING,
                    "This parsing session may not be complete due to some exception reported earlier");
        }
    } // end of try

    // currentTableId is still null when parsing stopped before the table
    // name was read (error page, malformed XML); compare null-safely so the
    // failure handled above is not turned into a NullPointerException here.
    if ("V3PollerDetailTable".equals(currentTableId)) {
        summaryInfoList.add(new SummaryInfo("auId", 4, siAuId));
        summaryInfoMap = new LinkedHashMap<String, String>();
        for (SummaryInfo si : summaryInfoList) {
            summaryInfoMap.put(si.getTitle(), si.getValue());
        }
    }

    // parsing summary
    logger.log(Level.FINE, "###################### parsing summary ######################");
    logger.log(Level.FINE, "currentTableId={0}", currentTableId);
    logger.log(Level.FINE, "currentTableTitle={0}", currentTableTitle);
    logger.log(Level.FINE, "currentTableKey={0}", currentTableKey);

    logger.log(Level.FINE, "columndescriptorList={0}", columndescriptorList);
    logger.log(Level.FINE, "# of columndescriptors={0}", columndescriptorList.size());
    logger.log(Level.FINE, "typeList={0}", typeList);
    logger.log(Level.FINE, "# of rows counted={0}", rowCounter);
    logger.log(Level.FINE, "# of rows excluded[active ones are excluded]={0}", rowIgnored);
    logger.log(Level.FINE, "summaryInfoList:size={0}", summaryInfoList.size());
    logger.log(Level.FINE, "summaryInfoList={0}", summaryInfoList);
    logger.log(Level.FINE, "table: cell counts = {0}", cellCounter);
    logger.log(Level.FINE, "tableData[map]=\n{0}", tableData);
    logger.log(Level.FINE, "tabularData[list]=\n{0}", tabularData);

    logger.log(Level.FINE, " completed in {0} ms\n\n", (System.currentTimeMillis() - startTime));

    // consistency check: rows x columns must match the number of cells seen
    if (!columndescriptorList.isEmpty()) {
        int noCols = columndescriptorList.size();
        if ("V3PollerTable".equals(currentTableId) && !hasErrorsColumn) {
            // one declared column is not counted when no "Errors" cell was seen
            noCols--;
        }
        int noCellsExpd = rowCounter * noCols;
        if (noCols > 0) {
            // this table has a table
            logger.log(Level.FINE, "checking parsing results: table dimmensions");
            if (noCellsExpd == cellCounter) {
                logger.log(Level.FINE, "table dimensions and cell-count are consistent");
            } else {
                int diff = noCellsExpd - cellCounter;
                logger.log(Level.FINE, "The table has {0} incomplete cells", diff);
                hasIncompleteRows = true;
                setIncompleteRowList();
                logger.log(Level.FINE, "incomplete rows: {0}", incompleteRows);
            }
        }
    }
}

From source file:com.ikanow.infinit.e.harvest.enrichment.custom.UnstructuredAnalysisHarvester.java

/**
 * processMeta - handle an individual field
 */
private void processMeta(DocumentPojo f, metaField m, String text, SourcePojo source,
        UnstructuredAnalysisConfigPojo uap) {

    boolean bAllowDuplicates = false;
    if ((null != m.flags) && m.flags.contains("U")) {
        bAllowDuplicates = true;
    }
    if ((null == m.scriptlang) || m.scriptlang.equalsIgnoreCase("regex")) {

        Pattern metaPattern = createRegex(m.script, m.flags);

        int timesToRun = 1;
        Object[] currField = null;
        if ((null != m.flags) && m.flags.contains("c")) {
            currField = f.getMetadata().get(m.fieldName);
        }
        if (null != currField) { // chained metadata
            timesToRun = currField.length;
            text = (String) currField[0];
        } //TESTED

        Matcher matcher = metaPattern.matcher(text);
        LinkedList<String> Llist = null;

        for (int ii = 0; ii < timesToRun; ++ii) {
            if (ii > 0) { // (else either just text, or in the above "chained metadata" initialization above)
                text = (String) currField[ii];
                matcher = metaPattern.matcher(text);
            } //TESTED

            StringBuffer prefix = new StringBuffer(m.fieldName).append(':');
            int nFieldNameLen = m.fieldName.length() + 1;

            try {
                while (matcher.find()) {
                    if (null == Llist) {
                        Llist = new LinkedList<String>();
                    }
                    if (null == m.groupNum) {
                        m.groupNum = 0;
                    }
                    String toAdd = matcher.group(m.groupNum);
                    if (null != m.replace) {
                        toAdd = metaPattern.matcher(toAdd).replaceFirst(m.replace);
                    }
                    if ((null != m.flags) && m.flags.contains("H")) {
                        toAdd = StringEscapeUtils.unescapeHtml(toAdd);
                    }
                    prefix.setLength(nFieldNameLen);
                    prefix.append(toAdd);
                    String dupCheck = prefix.toString();

                    if (!regexDuplicates.contains(dupCheck)) {
                        Llist.add(toAdd);
                        if (!bAllowDuplicates) {
                            regexDuplicates.add(dupCheck);
                        }
                    }
                }
            } catch (Exception e) {
                this._context.getHarvestStatus().logMessage("processMeta1: " + e.getMessage(), true);
            }
        } //(end metadata chaining handling)
        if (null != Llist) {
            if (null != currField) { // (overwrite)
                f.getMetadata().put(m.fieldName, Llist.toArray());
            } else {
                f.addToMetadata(m.fieldName, Llist.toArray());
            }
        } //TESTED
    } else if (m.scriptlang.equalsIgnoreCase("javascript")) {
        if (null == f.getMetadata()) {
            f.setMetadata(new LinkedHashMap<String, Object[]>());
        }
        //set the script engine up if necessary
        if ((null != source) && (null != uap)) {
            //(these are null if called from new processing pipeline vs legacy code)
            intializeScriptEngine(source, uap);
        }

        try {
            //TODO (INF-2488): in new format, this should only happen in between contentMeta blocks/docs
            // (also should be able to use SAH _document object I think?)

            // Javascript: the user passes in 
            Object[] currField = f.getMetadata().get(m.fieldName);
            if ((null == m.flags) || m.flags.isEmpty()) {
                if (null == currField) {
                    engine.put("text", text);
                    engine.put("_iterator", null);
                }
                //(otherwise will just pass the current fields in there)
            } else { // flags specified
                if (m.flags.contains("t")) { // text
                    engine.put("text", text);
                }
                if (m.flags.contains("d")) { // entire document (minus ents and assocs)
                    GsonBuilder gb = new GsonBuilder();
                    Gson g = gb.create();
                    List<EntityPojo> ents = f.getEntities();
                    List<AssociationPojo> assocs = f.getAssociations();
                    try {
                        f.setEntities(null);
                        f.setAssociations(null);
                        engine.put("document", g.toJson(f));
                        securityManager.eval(engine, JavaScriptUtils.initScript);
                    } finally {
                        f.setEntities(ents);
                        f.setAssociations(assocs);
                    }
                }
                if (m.flags.contains("m")) { // metadata
                    GsonBuilder gb = new GsonBuilder();
                    Gson g = gb.create();
                    engine.put("_metadata", g.toJson(f.getMetadata()));
                    securityManager.eval(engine, JavaScriptUtils.iteratorMetaScript);
                }
            } //(end flags processing)

            if (null != currField) {
                f.getMetadata().remove(m.fieldName);

                GsonBuilder gb = new GsonBuilder();
                Gson g = gb.create();
                engine.put("_iterator", g.toJson(currField));
                securityManager.eval(engine, JavaScriptUtils.iteratorDocScript);
            }
            //TESTED (handling of flags, and replacing of existing fields, including when field is null but specified)

            Object returnVal = securityManager.eval(engine, m.script);

            if (null != returnVal) {
                if (returnVal instanceof String) { // The only easy case
                    Object[] array = new Object[1];
                    if ((null != m.flags) && m.flags.contains("H")) {
                        returnVal = StringEscapeUtils.unescapeHtml((String) returnVal);
                    }
                    array[0] = returnVal;
                    f.addToMetadata(m.fieldName, array);
                } else { // complex object or array - in either case the engine turns these into
                         // internal.NativeArray or internal.NativeObject

                    BasicDBList outList = JavaScriptUtils.parseNativeJsObject(returnVal, engine);
                    f.addToMetadata(m.fieldName, outList.toArray());
                }
            }
        } catch (ScriptException e) {

            _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(),
                    true);

            // Just do nothing and log
            // e.printStackTrace();
            //DEBUG (don't output log messages per doc)
            //logger.error(e.getMessage());
        } catch (Exception e) {

            _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(),
                    true);

            // Just do nothing and log
            // e.printStackTrace();
            //DEBUG (don't output log messages per doc)
            //logger.error(e.getMessage());
        }
    } else if (m.scriptlang.equalsIgnoreCase("xpath")) {

        String xpath = m.script;

        try {
            createHtmlCleanerIfNeeded();

            int timesToRun = 1;
            Object[] currField = null;
            if ((null != m.flags) && m.flags.contains("c")) {
                currField = f.getMetadata().get(m.fieldName);
            }
            if (null != currField) { // chained metadata
                f.getMetadata().remove(m.fieldName); // (so will add to the end)
                timesToRun = currField.length;
                text = (String) currField[0];
            } //TESTED

            for (int ii = 0; ii < timesToRun; ++ii) {
                if (ii > 0) { // (else either just text, or in the above "chained metadata" initialization above)
                    text = (String) currField[ii];
                } //TESTED

                TagNode node = cleaner.clean(new ByteArrayInputStream(text.getBytes()));

                //NewCode : Only use html cleaner for cleansing
                //use JAXP for full Xpath lib
                Document doc = new DomSerializer(new CleanerProperties()).createDOM(node);

                String extraRegex = extractRegexFromXpath(xpath);

                if (extraRegex != null)
                    xpath = xpath.replace(extraRegex, "");

                XPath xpa = XPathFactory.newInstance().newXPath();
                NodeList res = (NodeList) xpa.evaluate(xpath, doc, XPathConstants.NODESET);

                if (res.getLength() > 0) {
                    if ((null != m.flags) && (m.flags.contains("o"))) { // "o" for object
                        m.groupNum = -1; // (see bConvertToObject below)
                    }
                    StringBuffer prefix = new StringBuffer(m.fieldName).append(':');
                    int nFieldNameLen = m.fieldName.length() + 1;
                    ArrayList<Object> Llist = new ArrayList<Object>(res.getLength());
                    boolean bConvertToObject = ((m.groupNum != null) && (m.groupNum == -1));
                    boolean convertToXml = ((null != m.flags) && (m.flags.contains("x")));
                    for (int i = 0; i < res.getLength(); i++) {
                        Node info_node = res.item(i);
                        if ((null != m.flags) && (m.flags.contains("g"))) {
                            Llist.add(parseHtmlTable(info_node, m.replace));
                        } else if (bConvertToObject || convertToXml) {
                            // Try to create a JSON object out of this
                            StringWriter writer = new StringWriter();
                            try {
                                Transformer transformer = TransformerFactory.newInstance().newTransformer();
                                transformer.transform(new DOMSource(info_node), new StreamResult(writer));
                            } catch (TransformerException e1) {
                                continue;
                            }

                            if (bConvertToObject) {
                                try {
                                    JSONObject subObj = XML.toJSONObject(writer.toString());
                                    if (xpath.endsWith("*")) { // (can have any number of different names here)
                                        Llist.add(XmlToMetadataParser.convertJsonObjectToLinkedHashMap(subObj));
                                    } //TESTED
                                    else {
                                        String[] rootNames = JSONObject.getNames(subObj);
                                        if (1 == rootNames.length) {
                                            // (don't think it can't be any other number in fact)
                                            subObj = subObj.getJSONObject(rootNames[0]);
                                        }
                                        boolean bUnescapeHtml = ((null != m.flags) && m.flags.contains("H"));
                                        Llist.add(XmlToMetadataParser.convertJsonObjectToLinkedHashMap(subObj,
                                                bUnescapeHtml));
                                    } //TESTED
                                } catch (JSONException e) { // Just carry on
                                    continue;
                                }
                                //TESTED
                            } else { // leave in XML form
                                Llist.add(writer.toString().substring(38)); // +38: (step over <?xml version="1.0" encoding="UTF-8"?>)
                            } //TESTED (xpath_test.json)
                        } else { // Treat this as string, either directly or via regex
                            String info = info_node.getTextContent().trim();
                            if (extraRegex == null || extraRegex.isEmpty()) {
                                prefix.setLength(nFieldNameLen);
                                prefix.append(info);
                                String dupCheck = prefix.toString();

                                if (!regexDuplicates.contains(dupCheck)) {
                                    if ((null != m.flags) && m.flags.contains("H")) {
                                        info = StringEscapeUtils.unescapeHtml(info);
                                    }
                                    Llist.add(info);
                                    if (!bAllowDuplicates) {
                                        regexDuplicates.add(dupCheck);
                                    }
                                }
                            } else { // Apply regex to the string
                                Pattern dataRegex = createRegex(extraRegex, m.flags);
                                Matcher dataMatcher = dataRegex.matcher(info);
                                boolean result = dataMatcher.find();
                                while (result) {
                                    String toAdd;
                                    if (m.groupNum != null)
                                        toAdd = dataMatcher.group(m.groupNum);
                                    else
                                        toAdd = dataMatcher.group();
                                    prefix.setLength(nFieldNameLen);
                                    prefix.append(toAdd);
                                    String dupCheck = prefix.toString();

                                    if (!regexDuplicates.contains(dupCheck)) {
                                        if ((null != m.flags) && m.flags.contains("H")) {
                                            toAdd = StringEscapeUtils.unescapeHtml(toAdd);
                                        }
                                        Llist.add(toAdd);
                                        if (!bAllowDuplicates) {
                                            regexDuplicates.add(dupCheck);
                                        }
                                    }

                                    result = dataMatcher.find();
                                }
                            } //(regex vs no regex)
                        } //(end string vs object)
                    }
                    if (Llist.size() > 0) {
                        f.addToMetadata(m.fieldName, Llist.toArray());
                    }
                }
            } //(end loop over metadata objects if applicable)

        } catch (IOException ioe) {
            _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(ioe).toString(),
                    true);

            // Just do nothing and log
            //DEBUG (don't output log messages per doc)
            //logger.error(ioe.getMessage());
        } catch (ParserConfigurationException e1) {
            _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(),
                    true);
            // Just do nothing and log
            //DEBUG (don't output log messages per doc)
            //logger.error(e1.getMessage());
        } catch (XPathExpressionException e1) {
            _context.getHarvestStatus().logMessage("Error evaluating xpath expression: " + xpath, true);
        }
    } else if (m.scriptlang.equalsIgnoreCase("stream")) { // XML or JSON streaming interface
        // which one?
        try {
            boolean json = false;
            boolean xml = false;
            for (int i = 0; i < 128; ++i) {
                if ('<' == text.charAt(i)) {
                    xml = true;
                    break;
                }
                if ('{' == text.charAt(i) || '[' == text.charAt(i)) {
                    json = true;
                    break;
                }
                if (!Character.isSpaceChar(text.charAt(i))) {
                    break;
                }
            } //TESTED (too many spaces: meta_stream_test, test4; incorrect chars: test3, xml: test1, json: test2)

            boolean textNotObject = m.flags == null || !m.flags.contains("o");

            List<DocumentPojo> docs = new LinkedList<DocumentPojo>();
            List<String> levelOneFields = null;
            if (null != m.script) {
                levelOneFields = Arrays.asList(m.script.split("\\s*,\\s*"));
                if ((1 == levelOneFields.size()) && levelOneFields.get(0).isEmpty()) {
                    // convert [""] to null
                    levelOneFields = null;
                }
            } //TESTED (json and xml)

            if (xml) {
                XmlToMetadataParser parser = new XmlToMetadataParser(levelOneFields, null, null, null, null,
                        null, Integer.MAX_VALUE);
                XMLInputFactory factory = XMLInputFactory.newInstance();
                factory.setProperty(XMLInputFactory.IS_COALESCING, true);
                factory.setProperty(XMLInputFactory.SUPPORT_DTD, false);
                XMLStreamReader reader = null;
                try {
                    reader = factory.createXMLStreamReader(new ByteArrayInputStream(text.getBytes()));
                    docs = parser.parseDocument(reader, textNotObject);
                } finally {
                    if (null != reader)
                        reader.close();
                }
            } //TESTED (meta_stream_test, test1)
            if (json) {
                JsonReader jsonReader = null;
                try {
                    JsonToMetadataParser parser = new JsonToMetadataParser(null, levelOneFields, null, null,
                            Integer.MAX_VALUE);
                    jsonReader = new JsonReader(
                            new InputStreamReader(new ByteArrayInputStream(text.getBytes()), "UTF-8"));
                    jsonReader.setLenient(true);
                    docs = parser.parseDocument(jsonReader, textNotObject);
                } finally {
                    if (null != jsonReader)
                        jsonReader.close();
                }
            } //TESTED (meta_stream_test test2)

            if (!docs.isEmpty()) {
                ArrayList<String> Llist = null;
                ArrayList<Object> LlistObj = null;
                if (textNotObject) {
                    Llist = new ArrayList<String>(docs.size());
                } else {
                    LlistObj = new ArrayList<Object>(docs.size());
                }
                for (DocumentPojo doc : docs) {
                    if ((null != doc.getFullText()) || (null != doc.getMetadata())) {
                        if (textNotObject) {
                            Llist.add(doc.getFullText());
                        } //TESTED
                        else if (xml) {
                            LlistObj.add(doc.getMetadata());
                        } //TESTED
                        else if (json) {
                            Object o = doc.getMetadata();
                            if (null != o) {
                                o = doc.getMetadata().get("json");
                                if (o instanceof Object[]) {
                                    LlistObj.addAll(Arrays.asList((Object[]) o));
                                } else if (null != o) {
                                    LlistObj.add(o);
                                } //TESTED
                            }
                        } //TESTED
                    }
                } //TESTED
                if ((null != Llist) && !Llist.isEmpty()) {
                    f.addToMetadata(m.fieldName, Llist.toArray());
                } //TESTED
                if ((null != LlistObj) && !LlistObj.isEmpty()) {
                    f.addToMetadata(m.fieldName, LlistObj.toArray());
                } //TESTED

            } //TESTED (meta_stream_test test1,test2)
        } //(end try)
        catch (Exception e) { // various parsing errors
            _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(),
                    true);
        }
    } //TESTED (meta_stream_test)

    // (don't currently support other script types)
}

From source file:net.xy.jcms.controller.configurations.parser.TranslationParser.java

/**
 * parses an xml configuration from an input streams. throwes
 * IllegalArgumentExceptions in case of syntax error.
 * /*from w  w  w  . j  a  v  a  2s. co m*/
 * @param in
 * @return value
 * @throws XMLStreamException
 * @throws ClassNotFoundException
 *             in case there are problems with an params type converter
 */
public static TranslationRule[] parse(final InputStream in, final ClassLoader loader)
        throws XMLStreamException, ClassNotFoundException {
    @SuppressWarnings("deprecation")
    final XMLInputFactory factory = XMLInputFactory.newInstance(
            "com.sun.xml.internal.stream.XMLInputFactoryImpl", TranslationParser.class.getClassLoader());
    LOG.info("XMLInputFactory loaded: " + factory.getClass().getName());
    factory.setProperty("javax.xml.stream.isCoalescing", true);
    // not supported be the reference implementation
    // factory.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.TRUE);
    final XMLStreamReader parser = factory.createXMLStreamReader(in);
    while (parser.hasNext()) {
        final int event = parser.next();
        if (event == XMLStreamConstants.START_ELEMENT && parser.getName().getLocalPart().equals("rules")) {
            return parseRules(parser, loader);
        }
    }
    throw new IllegalArgumentException("No rules section found.");
}

From source file:net.xy.jcms.controller.configurations.parser.TranslationParser.java

/**
 * Parses a single-rule translation file: scans forward to the first {@code rule}
 * start element and hands the reader to {@code parseRule}.
 *
 * @param in
 *            stream containing the XML; the stream itself is not closed here
 *            (closing an {@code XMLStreamReader} does not close its underlying
 *            input source)
 * @param loader
 *            class loader passed through to {@code parseRule}
 * @return the rule produced by {@code parseRule}
 * @throws XMLStreamException
 *             if the XML cannot be read
 * @throws ClassNotFoundException
 *             in case there are problems with a param's type converter
 * @throws IllegalArgumentException
 *             if the document contains no {@code rule} element
 */
public static TranslationRule parseSingle(final InputStream in, final ClassLoader loader)
        throws XMLStreamException, ClassNotFoundException {
    // NOTE(review): the first argument of this deprecated overload is documented as a
    // factoryId (a property name), not an implementation class name - confirm it
    // resolves as intended on the target JDK.
    @SuppressWarnings("deprecation")
    final XMLInputFactory factory = XMLInputFactory.newInstance(
            "com.sun.xml.internal.stream.XMLInputFactoryImpl", TranslationParser.class.getClassLoader());
    LOG.info("XMLInputFactory loaded: " + factory.getClass().getName());
    factory.setProperty(XMLInputFactory.IS_COALESCING, true);
    final XMLStreamReader parser = factory.createXMLStreamReader(in);
    try {
        while (parser.hasNext()) {
            final int event = parser.next();
            if (event == XMLStreamConstants.START_ELEMENT && parser.getName().getLocalPart().equals("rule")) {
                return parseRule(parser, loader);
            }
        }
        // (this method looks for a single "rule" element, so say so in the message)
        throw new IllegalArgumentException("No rule section found.");
    } finally {
        try {
            parser.close();
        } catch (final XMLStreamException ignored) {
            // best-effort cleanup: must not replace the parse result or the original failure
        }
    }
}

From source file:net.xy.jcms.controller.configurations.parser.UsecaseParser.java

/**
 * parses usecases out from an xml file/*from w  w w  .  j a va 2 s. c  o  m*/
 * 
 * @param in
 * @param loader
 *            used for retrieving configuration included resources and also
 *            for retrieving the controllers
 * @return value
 * @throws XMLStreamException
 * @throws ClassNotFoundException
 */
public static Usecase[] parse(final InputStream in, final ClassLoader loader)
        throws XMLStreamException, ClassNotFoundException {
    final XMLInputFactory factory = XMLInputFactory.newInstance();
    factory.setProperty("javax.xml.stream.isCoalescing", true);
    // not supported by the reference implementation
    // factory.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.TRUE);
    final XMLStreamReader parser = factory.createXMLStreamReader(in);
    while (parser.hasNext()) {
        final int event = parser.next();
        if (event == XMLStreamConstants.START_ELEMENT && parser.getName().getLocalPart().equals("usecases")) {
            return parseUsecases(parser, loader);
        }
    }
    throw new IllegalArgumentException("No usecases section found. [" + parser.getLocation() + "]");
}

From source file:net.xy.jcms.controller.configurations.parser.UsecaseParser.java

/**
 * Parses a single-usecase XML file (one usecase per file): scans forward to the
 * first {@code usecase} start element and hands the reader to {@code parseUsecase}.
 *
 * @param in
 *            stream containing the XML; the stream itself is not closed here
 *            (closing an {@code XMLStreamReader} does not close its underlying
 *            input source)
 * @param loader
 *            class loader passed through to {@code parseUsecase}
 * @return parsed usecase
 * @throws XMLStreamException
 *             if the XML cannot be read
 * @throws ClassNotFoundException
 *             propagated from {@code parseUsecase}
 * @throws IllegalArgumentException
 *             if the document contains no {@code usecase} element
 */
public static Usecase parseSingle(final InputStream in, final ClassLoader loader)
        throws XMLStreamException, ClassNotFoundException {
    final XMLInputFactory factory = XMLInputFactory.newInstance();
    factory.setProperty(XMLInputFactory.IS_COALESCING, true);
    // not supported by the reference implementation
    // factory.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.TRUE);
    final XMLStreamReader parser = factory.createXMLStreamReader(in);
    try {
        while (parser.hasNext()) {
            final int event = parser.next();
            if (event == XMLStreamConstants.START_ELEMENT && parser.getName().getLocalPart().equals("usecase")) {
                return parseUsecase(parser, loader);
            }
        }
        // (this method looks for a single "usecase" element, so say so in the message;
        // location is read while the reader is still open)
        throw new IllegalArgumentException("No usecase section found. [" + parser.getLocation() + "]");
    } finally {
        try {
            parser.close();
        } catch (final XMLStreamException ignored) {
            // best-effort cleanup: must not replace the parse result or the original failure
        }
    }
}

From source file:org.activiti.bpmn.converter.BpmnXMLConverter.java

/**
 * Reads BPMN 2.0 XML from the given provider and converts it to a {@code BpmnModel},
 * optionally validating it first.
 *
 * <p>When {@code validateSchema} is true the XML is validated (via the provider when
 * {@code enableSafeBpmnXml} is false, otherwise via the already-opened stream reader)
 * and a second stream is then obtained from the provider for the conversion itself.
 * Every reader opened here is closed before the method returns.
 *
 * @param inputStreamProvider source of the XML; asked for a stream once, or twice when validating
 * @param validateSchema      whether to validate before converting
 * @param enableSafeBpmnXml   selects which {@code validateModel} overload is used
 * @param encoding            charset name used to decode the stream
 * @return the model produced by {@code convertToBpmnModel(XMLStreamReader)}
 * @throws RuntimeException if validation fails, the encoding is unsupported, or the XML cannot be read
 */
public BpmnModel convertToBpmnModel(InputStreamProvider inputStreamProvider, boolean validateSchema,
        boolean enableSafeBpmnXml, String encoding) {
    XMLInputFactory xif = XMLInputFactory.newInstance();

    // Turn off entity replacement, external entities and DTD support wherever the
    // underlying implementation recognises the property.
    if (xif.isPropertySupported(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES)) {
        xif.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, false);
    }

    if (xif.isPropertySupported(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES)) {
        xif.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false);
    }

    if (xif.isPropertySupported(XMLInputFactory.SUPPORT_DTD)) {
        xif.setProperty(XMLInputFactory.SUPPORT_DTD, false);
    }

    // Readers from the validation pass are remembered only so that finally can release them.
    InputStreamReader validationIn = null;
    XMLStreamReader validationXtr = null;
    InputStreamReader in = null;
    XMLStreamReader xtr = null;
    try {
        in = new InputStreamReader(inputStreamProvider.getInputStream(), encoding);
        xtr = xif.createXMLStreamReader(in);

        try {
            if (validateSchema) {

                if (!enableSafeBpmnXml) {
                    validateModel(inputStreamProvider);
                } else {
                    validateModel(xtr);
                }

                // The first stream cannot be relied on after schema validation, so the
                // conversion reads from a fresh one. The first-pass readers are closed
                // in finally, after the conversion, so a provider that hands out the
                // same underlying stream twice is not closed prematurely.
                validationIn = in;
                validationXtr = xtr;
                in = null;
                xtr = null;
                in = new InputStreamReader(inputStreamProvider.getInputStream(), encoding);
                xtr = xif.createXMLStreamReader(in);
            }

        } catch (Exception e) {
            throw new RuntimeException("Could not validate XML with BPMN 2.0 XSD", e);
        }

        // XML conversion
        return convertToBpmnModel(xtr);
    } catch (UnsupportedEncodingException e) {
        throw new RuntimeException("The bpmn 2.0 xml is not UTF8 encoded", e);
    } catch (XMLStreamException e) {
        throw new RuntimeException("Error while reading the BPMN 2.0 XML", e);
    } finally {
        // Closing an XMLStreamReader does not close its input, so both layers are released.
        for (XMLStreamReader reader : new XMLStreamReader[] { xtr, validationXtr }) {
            if (reader != null) {
                try {
                    reader.close();
                } catch (XMLStreamException e) {
                    LOGGER.debug("Problem closing BPMN XML stream reader", e);
                }
            }
        }
        for (InputStreamReader reader : new InputStreamReader[] { in, validationIn }) {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    LOGGER.debug("Problem closing BPMN input stream", e);
                }
            }
        }
    }
}