Example usage for javax.xml.stream XMLInputFactory setProperty

List of usage examples for javax.xml.stream XMLInputFactory setProperty

Introduction

This page collects usage examples for the javax.xml.stream.XMLInputFactory.setProperty method.

Prototype

public abstract void setProperty(java.lang.String name, Object value) throws java.lang.IllegalArgumentException;

Source Link

Document

Allows the user to set a specific feature or property on the underlying implementation.

Usage

From source file:edu.unc.lib.dl.util.TripleStoreQueryServiceMulgaraImpl.java

/**
 * Passes a command to {@code sendTQL} and extracts the text of every
 * {@code <message>} element (matched by local name) from the XML response.
 *
 * @param query
 *            an ITQL command
 * @return the concatenated text of all message elements, or {@code null} when
 *         {@code sendTQL} returned {@code null} or the response could not be
 *         parsed as XML
 */
public String storeCommand(String query) {
    String result = null;
    String response = this.sendTQL(query);
    if (response != null) {
        XMLInputFactory factory = XMLInputFactory.newInstance();
        factory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
        // The response is external data: never resolve external entities it may reference.
        factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        try (StringReader sr = new StringReader(response)) {
            XMLEventReader r = factory.createXMLEventReader(sr);
            StringBuilder message = new StringBuilder();
            try {
                boolean inMessage = false;
                while (r.hasNext()) {
                    XMLEvent e = r.nextEvent();
                    if (e.isStartElement()) {
                        StartElement s = e.asStartElement();
                        if ("message".equals(s.getName().getLocalPart())) {
                            inMessage = true;
                        }
                    } else if (e.isEndElement()) {
                        EndElement end = e.asEndElement();
                        if ("message".equals(end.getName().getLocalPart())) {
                            inMessage = false;
                        }
                    } else if (inMessage && e.isCharacters()) {
                        message.append(e.asCharacters().getData());
                    }
                }
            } finally {
                // Release the event reader even when parsing fails part-way through.
                r.close();
            }
            // Only reached when both parsing and close() succeeded.
            result = message.toString();
        } catch (XMLStreamException e) {
            // No logger is visible in this block; keep the original reporting and return null.
            e.printStackTrace();
        }
    }
    return result;
}

From source file:com.ikanow.infinit.e.harvest.extraction.document.file.FileHarvester.java

/**
 * Turns one file into zero or more {@code DocumentPojo}s and appends them to {@code files}.
 *
 * <p>The file type is taken from the file name extension, or from the source's file
 * config {@code type} when one is set. Types "xml", "json" and anything ending in "sv"
 * are handed to the XML / JSON / line-oriented metadata parsers; every other type is
 * parsed with Tika into a single document.
 *
 * <p>Failures while reading or parsing are not propagated: {@code errors} is incremented
 * and a message is written to the harvest status.
 *
 * @param f the file to harvest
 * @param source the source the file belongs to
 * @throws MalformedURLException declared by the signature; presumably raised by the file
 *         accessors or the dedup helpers called outside the try blocks (TODO confirm)
 * @throws URISyntaxException declared by the signature; origin not visible here (TODO confirm)
 */
private void parse(InfiniteFile f, SourcePojo source) throws MalformedURLException, URISyntaxException {

    //NOTE: we only ever break out of here because of max docs in standalone mode
    // (because we don't know how to continue reading)

    DocumentPojo doc = null;
    //Determine File Extension
    String fileName = f.getName().toString();

    // (if the name contains no '.', lastIndexOf returns -1 and the whole name is used)
    int mid = fileName.lastIndexOf(".");
    String extension = fileName.substring(mid + 1, fileName.length());

    //Checked to save processing time
    long fileTimestamp = (f.getDate() / 1000) * 1000;
    // (ensure truncated to seconds, since some operation somewhere here does this...)

    Date modDate = new Date(fileTimestamp);
    //XML Data gets placed into MetaData

    boolean bIsXml = false;
    boolean bIsJson = false;
    boolean bIsLineOriented = false;
    // An explicit type in the source's file config overrides the file name extension
    if ((null != source.getFileConfig()) && (null != source.getFileConfig().type)) {
        extension = source.getFileConfig().type;
    }
    bIsXml = extension.equalsIgnoreCase("xml");
    bIsJson = extension.equalsIgnoreCase("json");
    bIsLineOriented = extension.endsWith("sv");

    if (bIsXml || bIsJson || bIsLineOriented) {
        int debugMaxDocs = Integer.MAX_VALUE; // by default don't set this, it's only for debug mode
        if (_context.isStandalone()) { // debug mode
            debugMaxDocs = maxDocsPerCycle;
        }
        //fast check to see if the file has changed before processing (or if it never existed)
        if (needsUpdated_SourceUrl(modDate, f.getUrlString(), source)) {
            if (0 != modDate.getTime()) { // if it ==0 then sourceUrl doesn't exist at all, no need to delete
                // This file already exists - in normal/managed mode will re-create
                // In streaming mode, simply skip over
                if (_streaming) {
                    return;
                } //TESTED

                DocumentPojo docRepresentingSrcUrl = new DocumentPojo();
                docRepresentingSrcUrl.setSourceUrl(f.getUrlString());
                docRepresentingSrcUrl.setSourceKey(source.getKey());
                docRepresentingSrcUrl.setCommunityId(source.getCommunityIds().iterator().next());
                sourceUrlsGettingUpdated.add(docRepresentingSrcUrl.getSourceUrl());
                this.docsToRemove.add(docRepresentingSrcUrl);
                // (can add documents with just source URL, are treated differently in the core libraries)
            }

            SourceFileConfigPojo fileSystem = source.getFileConfig();
            if ((null == fileSystem) && (bIsXml || bIsJson)) {
                fileSystem = new SourceFileConfigPojo();
            }
            XmlToMetadataParser xmlParser = null;
            JsonToMetadataParser jsonParser = null;
            String urlType = extension;
            if (bIsXml) {
                xmlParser = new XmlToMetadataParser(fileSystem.XmlRootLevelValues, fileSystem.XmlIgnoreValues,
                        fileSystem.XmlSourceName, fileSystem.XmlPrimaryKey, fileSystem.XmlAttributePrefix,
                        fileSystem.XmlPreserveCase, debugMaxDocs);
            } //TESTED
            else if (bIsJson) {
                jsonParser = new JsonToMetadataParser(fileSystem.XmlSourceName, fileSystem.XmlRootLevelValues,
                        fileSystem.XmlPrimaryKey, fileSystem.XmlIgnoreValues, debugMaxDocs);
            } //TESTED

            List<DocumentPojo> partials = null;
            try {
                if (bIsXml) {
                    XMLStreamReader xmlStreamReader = null;
                    InputStream xmlInput = null;
                    XMLInputFactory factory = XMLInputFactory.newInstance();
                    factory.setProperty(XMLInputFactory.IS_COALESCING, true);
                    factory.setProperty(XMLInputFactory.SUPPORT_DTD, false);
                    try {
                        xmlInput = f.getInputStream();
                        xmlStreamReader = factory.createXMLStreamReader(xmlInput);
                        partials = xmlParser.parseDocument(xmlStreamReader);
                        long memUsage = xmlParser.getMemUsage();
                        _memUsage += memUsage;
                        _totalMemUsage.addAndGet(memUsage);
                    } finally {
                        try {
                            if (null != xmlStreamReader)
                                xmlStreamReader.close();
                        } finally {
                            // XMLStreamReader.close() does not close the underlying input source,
                            // so the stream has to be closed explicitly
                            if (null != xmlInput)
                                xmlInput.close();
                        }
                    }
                } //TESTED
                else if (bIsJson) {
                    JsonReader jsonReader = null;
                    try {
                        jsonReader = new JsonReader(new InputStreamReader(f.getInputStream(), "UTF-8"));
                        jsonReader.setLenient(true);
                        partials = jsonParser.parseDocument(jsonReader);
                        long memUsage = jsonParser.getMemUsage();
                        _memUsage += memUsage;
                        _totalMemUsage.addAndGet(memUsage);
                    } finally {
                        if (null != jsonReader)
                            jsonReader.close();
                    }
                } //TESTED
                else if (bIsLineOriented) { // Just generate a document for every line

                    BufferedReader lineReader = null;
                    try {
                        lineReader = new BufferedReader(new InputStreamReader(f.getInputStream(), "UTF-8"));
                        CsvToMetadataParser lineParser = new CsvToMetadataParser(debugMaxDocs);
                        partials = lineParser.parseDocument(lineReader, source);
                        long memUsage = lineParser.getMemUsage();
                        _memUsage += memUsage;
                        _totalMemUsage.addAndGet(memUsage);
                    } finally {
                        if (null != lineReader)
                            lineReader.close();
                    }
                } //TESTED

                // Only used below as an "is MD5 available" flag; the digest itself comes from DigestUtils
                MessageDigest md5 = null; // (generates unique urls if the user doesn't below)
                try {
                    md5 = MessageDigest.getInstance("MD5");
                } catch (NoSuchAlgorithmException e) {
                    // Do nothing, unlikely to happen...
                }
                int nIndex = 0;
                int numPartials = partials.size();
                for (DocumentPojo doctoAdd : partials) {
                    nIndex++;
                    doctoAdd.setSource(source.getTitle());
                    doctoAdd.setSourceKey(source.getKey());
                    doctoAdd.setMediaType(source.getMediaType());
                    doctoAdd.setModified(new Date(fileTimestamp));
                    doctoAdd.setCreated(new Date());

                    if (null == doctoAdd.getUrl()) { // Can be set in the parser or here
                        doctoAdd.setHasDefaultUrl(true); // (ie cannot occur in a different src URL)

                        if (1 == numPartials) {
                            String urlString = f.getUrlString();
                            if (urlString.endsWith(urlType)) {
                                doctoAdd.setUrl(urlString);
                            } else {
                                doctoAdd.setUrl(
                                        new StringBuffer(urlString).append('.').append(urlType).toString());
                            }
                            // (we always set sourceUrl as the true url of the file, so want to differentiate the URL with
                            //  some useful information)
                        } else if (null == doctoAdd.getMetadata()) { // Line oriented case
                            doctoAdd.setUrl(new StringBuffer(f.getUrlString()).append("/").append(nIndex)
                                    .append('.').append(urlType).toString());
                        } else {
                            if (null == md5) { // Will never happen, MD5 always exists
                                doctoAdd.setUrl(new StringBuffer(f.getUrlString()).append("/")
                                        .append(doctoAdd.getMetadata().hashCode()).append('.').append(urlType)
                                        .toString());
                            } else { // This is the standard call if the XML parser has not been configured to build the URL
                                doctoAdd.setUrl(new StringBuffer(f.getUrlString()).append("/")
                                        .append(DigestUtils.md5Hex(doctoAdd.getMetadata().toString()))
                                        .append('.').append(urlType).toString());
                            }
                        } //TESTED
                    }
                    doctoAdd.setTitle(f.getName().toString());
                    doctoAdd.setPublishedDate(new Date(fileTimestamp));
                    doctoAdd.setSourceUrl(f.getUrlString());

                    // Always add to files because I'm deleting the source URL
                    files.add(doctoAdd);
                } //TESTED

            } catch (XMLStreamException e1) {
                errors++;
                _context.getHarvestStatus()
                        .logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(), true);
            } catch (FactoryConfigurationError e1) {
                errors++;
                _context.getHarvestStatus().logMessage(e1.getMessage(), true);

            } catch (IOException e1) {
                errors++;
                _context.getHarvestStatus()
                        .logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(), true);
            } catch (Exception e1) {
                errors++;
                _context.getHarvestStatus()
                        .logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(), true);
            }
        } //(end if needs updated)
    } else //Tika supports Excel,Word,Powerpoint,Visio, & Outlook Documents
    {
        // (This dedup tells me if it's an add/update vs ignore - qr.isDuplicate higher up tells me if I need to add or update)
        if (needsUpdated_Url(modDate, f.getUrlString(), source)) {

            Metadata metadata = null;
            InputStream in = null;
            try {

                doc = new DocumentPojo();

                // Create a tika object (first time only)
                if (null == _tika) {
                    this.initializeTika(_context, source);
                }

                // NOTE: likely need to set a limit of 30MB or 50MB and bypass anything over that;
                // the code to do that is:
                // tika.setMaxStringLength(30*1024*1024);
                // Disable the string length limit
                _tika.setMaxStringLength(-1);
                //input = new FileInputStream(new File(resourceLocation));
                // Create a metadata object to contain the metadata

                metadata = new Metadata();
                // Parse the file and get the text of the file
                doc.setSource(source.getTitle());
                doc.setSourceKey(source.getKey());
                doc.setMediaType(source.getMediaType());
                String fullText = "";

                in = f.getInputStream();
                try {
                    if (null == _tikaOutputFormat) { // text only
                        fullText = _tika.parseToString(in, metadata);
                    } //TESTED
                    else { // XML/HTML
                        try {
                            _tika.getParser().parse(in, _tikaOutputFormat, metadata, _tikaOutputParseContext);
                            fullText = _tikaXmlFormatWriter.toString();
                        } finally {
                            // The writer outlives this call: always reset it so that a failed parse
                            // cannot leave partial output to be picked up with the next file
                            _tikaXmlFormatWriter.getBuffer().setLength(0);
                        }
                    } //TESTED
                } finally {
                    if (null != in)
                        in.close();
                }
                // Description is the first 500 characters of the text (or all of it, if shorter)
                int descCap = 500;
                doc.setFullText(fullText);
                if (descCap > fullText.length()) {
                    descCap = fullText.length();
                }
                doc.setDescription(fullText.substring(0, descCap));
                doc.setModified(new Date(fileTimestamp));
                doc.setCreated(new Date());
                doc.setUrl(f.getUrlString());
                doc.setTitle(f.getName().toString());
                doc.setPublishedDate(new Date(fileTimestamp));

                long memUsage = (250L * (doc.getFullText().length() + doc.getDescription().length())) / 100L; // 25% overhead, 2x for string->byte
                _memUsage += memUsage;
                _totalMemUsage.addAndGet(memUsage);

                // If the metadata contains a title then use that instead of the file name
                try {
                    String title = metadata.get(Metadata.TITLE);
                    if (null != title) {
                        doc.setTitle(title);
                    }
                } catch (Exception e) { // Fine just carry on
                }
                // If the metadata contains a more plausible date then use that
                try {
                    Date date = metadata.getDate(Metadata.CREATION_DATE); // MS Word
                    if (null != date) {
                        doc.setPublishedDate(date);
                    } else {
                        date = metadata.getDate(Metadata.DATE); // Dublin
                        if (null != date) {
                            doc.setPublishedDate(date);
                        } else {
                            date = metadata.getDate(Metadata.ORIGINAL_DATE);
                            if (null != date) {
                                doc.setPublishedDate(date);
                            }
                        }
                    }
                } catch (Exception e) { // Fine just carry on
                }
                //TESTED

                // If the metadata contains a geotag then apply that:
                try {
                    String lat = metadata.get(Metadata.LATITUDE);
                    String lon = metadata.get(Metadata.LONGITUDE);
                    if ((null != lat) && (null != lon)) {
                        GeoPojo gt = new GeoPojo();
                        gt.lat = Double.parseDouble(lat);
                        gt.lon = Double.parseDouble(lon);
                        doc.setDocGeo(gt);
                    }
                } catch (Exception e) { // Fine just carry on
                }

                // Save the entire metadata:
                doc.addToMetadata("_FILE_METADATA_", metadata);

                // (each iteration overwrites the previous value, so the last community id wins)
                for (ObjectId communityId : source.getCommunityIds()) {
                    doc.setCommunityId(communityId);
                }
                files.add(doc);

                // Close the input stream
                in.close();
                in = null;

                //TESTED

            } catch (SmbException e) {
                errors++;
                _context.getHarvestStatus()
                        .logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
            } catch (MalformedURLException e) {
                errors++;
                _context.getHarvestStatus()
                        .logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
            } catch (UnknownHostException e) {
                errors++;
                _context.getHarvestStatus()
                        .logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
            } catch (IOException e) {
                errors++;
                _context.getHarvestStatus().logMessage(e.getMessage(), true);
            } catch (TikaException e) {
                errors++;
                _context.getHarvestStatus().logMessage(e.getMessage(), true);
            } catch (Exception e) {
                errors++;
                _context.getHarvestStatus()
                        .logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
            } finally { // Close the input stream if an error occurs
                if (null != in) {
                    try {
                        in.close();
                    } catch (IOException e) {
                        // All good, do nothing
                    }
                }
            } // end exception handling
        } // end dedup check
    } // end XML vs "office" app

    //DEBUG
    //System.out.println("FILE=" + files.size() + " / MEM=" + _memUsage + " VS " + Runtime.getRuntime().totalMemory());
}

From source file:com.clustercontrol.agent.winevent.WinEventMonitor.java

/**
 * XMLStAX???EventLogRecord????/*  w w  w.  j a v  a2s .co m*/
 * @param eventXmlStream
 * @return EventLogRecord?
 */
private ArrayList<EventLogRecord> parseEventXML(InputStream eventXmlStream) {
    ArrayList<EventLogRecord> eventlogs = new ArrayList<EventLogRecord>();

    try {
        XMLInputFactory xmlif = XMLInputFactory.newInstance();
        /**
         * OpenJDK7/OracleJDK7??"]"?2?????????????????????????????
         * ?XML?????????OpenJDK7/OracleJDK7???????/??????????
         * URL???????????????
         * 
         * URL
         * http://docs.oracle.com/javase/jp/6/api/javax/xml/stream/XMLStreamReader.html#next()
         */
        String xmlCoalescingKey = "javax.xml.stream.isCoalescing";// TODO JRE???????????????????
        if (m_log.isDebugEnabled()) {
            m_log.debug(xmlCoalescingKey + " = true");
        }
        xmlif.setProperty(xmlCoalescingKey, true);
        XMLStreamReader xmlr = xmlif.createXMLStreamReader(eventXmlStream);

        while (xmlr.hasNext()) {
            switch (xmlr.getEventType()) {
            case XMLStreamConstants.START_ELEMENT:
                m_log.trace("EventType : XMLStreamConstants.START_ELEMENT");

                String localName = xmlr.getLocalName();
                m_log.trace("local name : " + localName);

                if ("Event".equals(localName)) {
                    EventLogRecord eventlog = new EventLogRecord();
                    eventlogs.add(eventlog);
                    m_log.debug("create new EventLogRecord");
                } else {
                    String attrLocalName = null;
                    String attrValue = null;

                    if (xmlr.getAttributeCount() != 0) {
                        attrLocalName = xmlr.getAttributeLocalName(0);
                        attrValue = xmlr.getAttributeValue(0);
                        m_log.trace("attribute local name : " + attrLocalName);
                        m_log.trace("attribute local value : " + attrValue);
                    }

                    if ("Provider".equals(localName)) {
                        if ("Name".equals(attrLocalName)) {
                            m_log.trace("target value : " + attrValue);

                            EventLogRecord eventlog = eventlogs.get(eventlogs.size() - 1);
                            eventlog.setProviderName(attrValue);
                            m_log.debug("set ProviderName : " + eventlog.getProviderName());
                        }
                    }
                    // Get-WinEvent/wevtutil.exe
                    else if ("TimeCreated".equals(localName) && "SystemTime".equals(attrLocalName)) {
                        m_log.trace("target value : " + attrValue);

                        // "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS'Z'"???S????????????
                        String formatedDateString = attrValue.replaceAll("\\..*Z", "");
                        m_log.trace("formatted target value : " + formatedDateString);
                        DateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");
                        sdf.setTimeZone(TimeZone.getTimeZone("UTC"));

                        EventLogRecord eventlog = eventlogs.get(eventlogs.size() - 1);
                        ;
                        try {
                            eventlog.setTimeCreated(sdf.parse(formatedDateString));
                        } catch (ParseException e) {
                            // do nothing
                            m_log.error("set TimeCreated Error", e);
                        }
                        m_log.debug("set TimeCreated : " + eventlog.getTimeCreated());
                    }
                    // Get-EventLog
                    if ("TimeGenerated".equals(localName) && "SystemTime".equals(attrLocalName)) {
                        m_log.trace("target value : " + attrValue);
                        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss'Z'");
                        sdf.setTimeZone(HinemosTime.getTimeZone());

                        EventLogRecord eventlog = eventlogs.get(eventlogs.size() - 1);
                        ;
                        try {
                            eventlog.setTimeCreated(sdf.parse(attrValue));
                        } catch (ParseException e) {
                            // do nothing
                            m_log.error("set TimeCreated Error", e);
                        }
                        m_log.debug("set TimeCreated : " + eventlog.getTimeCreated());
                    } else {
                        targetProperty = localName;
                        m_log.trace("target property : " + targetProperty);
                    }
                }

                break;
            case XMLStreamConstants.SPACE:
            case XMLStreamConstants.CHARACTERS:
                m_log.trace("EventType : XMLStreamConstants.CHARACTERS, length=" + xmlr.getTextLength());
                if (targetProperty != null) {
                    try {
                        EventLogRecord eventlog = eventlogs.get(eventlogs.size() - 1);
                        ;
                        if ("EventID".equals(targetProperty)) {
                            eventlog.setId(Integer.parseInt(new String(xmlr.getTextCharacters(),
                                    xmlr.getTextStart(), xmlr.getTextLength())));
                            m_log.debug("set EventID : " + eventlog.getId());
                        }
                        // Get-WinEvent/wevtutil.exe
                        else if ("Level".equals(targetProperty)) {
                            if (eventlog.getLevel() == WinEventConstant.UNDEFINED) {
                                eventlog.setLevel(Integer.parseInt(new String(xmlr.getTextCharacters(),
                                        xmlr.getTextStart(), xmlr.getTextLength())));
                                m_log.debug("set Level : " + eventlog.getLevel());
                            }
                        } else if ("Task".equals(targetProperty)) {
                            if (eventlog.getTask() == WinEventConstant.UNDEFINED) {
                                eventlog.setTask(Integer.parseInt(new String(xmlr.getTextCharacters(),
                                        xmlr.getTextStart(), xmlr.getTextLength())));
                                m_log.debug("set Task : " + eventlog.getTask());
                            }
                        } else if ("Keywords".equals(targetProperty)) {
                            // TODO ????????0x8080000000000000
                            //eventlog.setKeywords(Long.decode(new String(xmlr.getTextCharacters(), xmlr.getTextStart(), xmlr.getTextLength())));
                            //m_log.debug("set Keywords : " + eventlog.getKeywords());
                        } else if ("EventRecordId".equals(targetProperty)) {
                            eventlog.setRecordId(Long.parseLong(new String(xmlr.getTextCharacters(),
                                    xmlr.getTextStart(), xmlr.getTextLength())));
                            m_log.debug("set RecordId : " + eventlog.getRecordId());
                        } else if ("Channel".equals(targetProperty)) {
                            eventlog.setLogName(new String(xmlr.getTextCharacters(), xmlr.getTextStart(),
                                    xmlr.getTextLength()));
                            m_log.debug("set LogName : " + eventlog.getLogName());
                        } else if ("Computer".equals(targetProperty)) {
                            eventlog.setMachineName(new String(xmlr.getTextCharacters(), xmlr.getTextStart(),
                                    xmlr.getTextLength()));
                            m_log.debug("set MachineName : " + eventlog.getMachineName());
                        } else if ("Message".equals(targetProperty)) {
                            String message = new String(xmlr.getTextCharacters(), xmlr.getTextStart(),
                                    xmlr.getTextLength());
                            message = message.replaceAll(tmpReturnCode, "\r\n");
                            message = message.replaceAll(tmpLtCode, "<");
                            message = message.replaceAll(tmpGtCode, ">");
                            eventlog.setMessage(message);
                            m_log.debug("set Message : " + eventlog.getMessage());
                        } else if ("Data".equals(targetProperty)) {
                            String data = new String(xmlr.getTextCharacters(), xmlr.getTextStart(),
                                    xmlr.getTextLength());
                            eventlog.getData().add(data);
                            m_log.debug("set Data : " + data);
                        } else {
                            m_log.debug("unknown target property : " + targetProperty);
                        }
                    } catch (NumberFormatException e) {
                        m_log.debug("number parse error", e);
                    }
                }
                targetProperty = null;
                break;
            default: // 
                break;
            }
            xmlr.next();
        }
        xmlr.close();
    } catch (XMLStreamException e) {
        m_log.warn("parseEvent() xmlstream error", e);
    }

    return eventlogs;

}

From source file:edu.harvard.iq.safe.lockss.impl.LOCKSSDaemonStatusTableXmlStreamParser.java

/**
 * Parses one LOCKSS daemon status-table XML document from {@code stream}
 * with a StAX cursor reader and stores the outcome in this parser's fields.
 *
 * <p>While walking the document the method records the table id, key and
 * title, the column descriptors (names and types), every row both as a list
 * ({@code tabularData}) and as a column-name-keyed map ({@code tableData}),
 * and every {@code summaryinfo} element. Parsing stops early when an
 * {@code error} element is met or when the table id is not contained in
 * {@code belongsInclusionTableList}. Rows whose crawl status was seen as
 * "Active" or "Pending" are counted in {@code rowIgnored} and are not
 * stored.
 *
 * <p>Exceptions raised while parsing are logged and reported through
 * {@code isTargetPageValid == false}; they are not propagated. After
 * parsing, the number of closed cells is compared with
 * rows&nbsp;&times;&nbsp;columns and, on a mismatch, the table is flagged
 * as having incomplete rows.
 *
 * <p>The StAX reader is closed before returning; {@code stream} itself is
 * not closed by this method.
 *
 * @param stream   the XML document to parse
 * @param encoding the character encoding handed to the StAX reader
 */
@Override
public void read(InputStream stream, String encoding) {
    // logger.setLevel(Level.FINE);
    // 1. create Input factory
    XMLInputFactory xmlif = XMLInputFactory.newInstance();
    xmlif.setProperty("javax.xml.stream.isCoalescing", java.lang.Boolean.TRUE);
    xmlif.setProperty("javax.xml.stream.isNamespaceAware", java.lang.Boolean.TRUE);
    // The document comes from a remote daemon: never resolve external
    // entities on its behalf (XXE hardening).
    xmlif.setProperty("javax.xml.stream.isSupportingExternalEntities", java.lang.Boolean.FALSE);

    long startTime = System.currentTimeMillis();

    String currentTableId = null;
    String currentTableTitle = null;
    String currentTableKey = null;
    boolean hasErrorsColumn = false;
    String siAuId = null;
    XMLStreamReader xmlr = null;

    try {

        // create reader
        xmlr = xmlif.createXMLStreamReader(new BufferedInputStream(stream), encoding);

        String curElement = "";

        boolean isLastTagnameTable = false;
        boolean withinSummaryinfo = false;
        boolean withinColumndescriptor = false;
        boolean withinRow = false;
        boolean withinCell = false;
        boolean withinReference = false;
        boolean isCrawlStatusActive = false;
        boolean isCrawlStatusColumn = false;
        int valueTagCounter = 0;
        String currentColumnName = null;
        String currentCellValue = null;
        String currentCellKey = null;
        SummaryInfo si = null;

        List<String> rowData = null;
        Map<String, String> rowDataH = null;

        w1: while (xmlr.hasNext()) {
            int eventType = xmlr.next();
            switch (eventType) {
            case XMLStreamConstants.START_ELEMENT:
                curElement = xmlr.getLocalName(); // note: getName() ->
                // QName
                logger.log(Level.FINE, "--------- start tag = <{0}> ---------", curElement);
                // check the table name first
                if (curElement.equals("table")) {
                    isLastTagnameTable = true;
                } else if (curElement.equals("error")) {
                    // the daemon returned an error page: give up
                    isTargetPageValid = false;
                    break w1;
                }

                // ---- table header: name / key / title ----------------------------------------
                if (isLastTagnameTable) {
                    if (curElement.equals("name")) {
                        currentTableId = xmlr.getElementText();
                        logger.log(Level.FINE, "########### table Id = [{0}] ###########", currentTableId);
                        tableId = currentTableId;
                        if (belongsInclusionTableList.contains(currentTableId)) {
                            logger.log(Level.FINE, "!!!!! Table ({0}) belongs to the target list !!!!!",
                                    currentTableId);

                        } else {
                            logger.log(Level.FINE,
                                    "XXXXXXXXXXX Table ({0}) does not belong to the target list XXXXXXXXXXX",
                                    currentTableId);
                            break w1;
                        }
                    } else if (curElement.equals("key")) {
                        currentTableKey = xmlr.getElementText();
                        logger.log(Level.FINE, "---------- table key = ({0}) ----------", currentTableKey);
                        tableKey = currentTableKey;
                    } else if (curElement.equals("title")) {
                        currentTableTitle = xmlr.getElementText();
                        logger.log(Level.FINE, "+++++++++ table Title = ({0}) +++++++++", currentTableTitle);
                        if (currentTableId.equals("PeerRepair")) {
                            if (currentTableTitle.startsWith("Repair candidates for AU: ")) {
                                currentTableTitle = currentTableTitle.replaceFirst("Repair candidates for AU: ",
                                        "");
                                logger.log(Level.FINE, "save this modified table-Title as auName={0}",
                                        currentTableTitle);
                                this.tableTitle = currentTableTitle;
                            } else {
                                logger.log(Level.WARNING,
                                        "The table-Title does not start with the expected token={0}",
                                        currentTableTitle);
                            }
                        }
                        // the title is the last header element of interest
                        isLastTagnameTable = false;
                    }
                }

                // ---- state switches driven by the element name -------------------------------
                if (curElement.equals("columndescriptor")) {
                    withinColumndescriptor = true;
                } else if (curElement.equals("row")) {
                    withinRow = true;
                    rowCounter++;
                    logger.log(Level.FINE, "================== {0}-th row starts here ==================",
                            rowCounter);
                    // set up fresh storage for every row
                    rowData = new ArrayList<String>();
                    rowDataH = new LinkedHashMap<String, String>();
                } else if (curElement.equals("cell")) {
                    logger.log(Level.FINE, "entering a cell");
                    withinCell = true;
                } else if (curElement.equals("reference")) {
                    withinReference = true;
                    logger.log(Level.FINE, "within reference on");
                } else if (curElement.equals("summaryinfo")) {
                    withinSummaryinfo = true;
                    si = new SummaryInfo();
                } else if (curElement.equals("value")) {
                    logger.log(Level.FINE, "entering a value");
                    valueTagCounter++;
                }
                //---- columndescriptor tag ---------------------------------------------------
                if (withinColumndescriptor) {
                    if (curElement.equals("name")) {

                        String nameText = xmlr.getElementText();
                        logger.log(Level.FINE, "\tcolumndescriptor: name = {0}", nameText);
                        columndescriptorList.add(nameText);
                    } else if (curElement.equals("title")) {
                        String titleText = xmlr.getElementText();
                        logger.log(Level.FINE, "\tcolumndescriptor: title = {0}", titleText);
                    } else if (curElement.equals("type")) {
                        String typeText = xmlr.getElementText();
                        logger.log(Level.FINE, "\tcolumndescriptor: type = {0}", typeText);
                        getTypeList().add(typeText);
                    }
                }
                //---- cell tag ----------------------------------------------------------------
                if (withinCell) {
                    logger.log(Level.FINE, "parsing withinCell");
                    if (curElement.equals("columnname")) {

                        String columnname = xmlr.getElementText();
                        logger.log(Level.FINE, "\t\tcolumnname = {0}", columnname);
                        currentColumnName = columnname;
                        if (columnname.equals("crawl_status")) {
                            isCrawlStatusColumn = true;
                        } else {
                            isCrawlStatusColumn = false;
                        }

                        if (columnname.equals("Errors")) {
                            hasErrorsColumn = true;
                        }

                    } else {
                        // value tag block: either value-tag WO a child element
                        // or with a child element
                        /*
                         * <value><reference>...<value>xxxx</value>
                         * <value>xxxx</value>
                         */
                        if ((curElement.equals("value")) && (!withinReference)) {
                            logger.log(Level.FINE, "entering el:value/WO-REF block");
                            if (!hasReferenceTag.contains(currentColumnName)) {
                                logger.log(Level.FINE, "No child reference tag is expected for this value tag");
                                logger.log(Level.FINEST, "xmlr.getEventType():pre-parsing={0}",
                                        xmlr.getEventType());
                                String cellValue = xmlr.getElementText();
                                // note: the above parsing action moves the
                                // cursor to the end-tag, i.e., </value>
                                // therefore, the end-element-switch-block below
                                // cannot catch this </value> tag

                                logger.log(Level.FINE, "\t\t\t[No ref: value] {0} = {1}",
                                        new Object[] { currentColumnName, cellValue });

                                currentCellValue = cellValue;
                                logger.log(Level.FINEST, "xmlr.getEventType():post-parsing={0}",
                                        xmlr.getEventType());
                                // store this value
                                logger.log(Level.FINE, "current column name={0}", currentColumnName);
                                logger.log(Level.FINE, "valueTagCounter={0}", valueTagCounter);
                                if (currentColumnName.endsWith("Damaged")) {
                                    if (valueTagCounter <= 1) {
                                        // the 2nd value tag is a footnote for
                                        // this column; only the 1st is stored
                                        rowData.add(cellValue);
                                        rowDataH.put(currentColumnName, currentCellValue);
                                    }
                                } else {
                                    rowData.add(cellValue);
                                    rowDataH.put(currentColumnName, currentCellValue);
                                }
                            } else {
                                // previously this block was unthinkable, but
                                // it was found that there are columns that
                                // temporarily have a <reference> tag in
                                // crawl_status_table; these columns are
                                // included in hasReferenceTag by default;
                                // thus, for such unstable columns,
                                // when they have a <reference> tag,
                                // data are captured in another within-
                                // reference block; however, when these
                                // columns no longer have a <reference> tag,
                                // text data would be left uncaptured unless
                                // some follow-up processing takes place here
                                logger.log(Level.FINE, "May have to capture data: column={0}",
                                        currentColumnName);
                                if (mayHaveReferenceTag.contains(currentColumnName) && !isCrawlStatusActive) {
                                    // because the crawling is not active,
                                    // it is safely assumed that the maybe columns have no reference tag

                                    // 2011-10-24 the above assumption was found wrong
                                    // a crawling cell does not say active but
                                    // subsequent columns have a reference
                                    logger.log(Level.FINE,
                                            "a text or a reference tag : try to parse it as a text");
                                    String cellValue = null;
                                    try {
                                        cellValue = xmlr.getElementText();
                                    } catch (javax.xml.stream.XMLStreamException ex) {
                                        // not text-only content (a child element
                                        // was met): move on to the next event
                                        continue;
                                    }
                                    logger.log(Level.FINE, "\t\t\t[value WO-ref(crawling_NOT_active case)={0}]",
                                            currentColumnName + " = " + cellValue);
                                    currentCellValue = cellValue;
                                    // store this value
                                    logger.log(Level.FINE, "\t\t\tcurrent columnName={0}", currentColumnName);
                                    rowData.add(cellValue);
                                    rowDataH.put(currentColumnName, currentCellValue);

                                } else {
                                    logger.log(Level.FINE, "WO-Ref: no processing items now:{0}", curElement);
                                }
                            }
                        } else if (withinReference) {
                            // reference tag exists
                            logger.log(Level.FINE, "WR:curElement={0}", curElement);

                            if (curElement.equals("key")) {
                                String cellKey = xmlr.getElementText();
                                logger.log(Level.FINE, "\t\tcurrentCellKey is set to={0}", cellKey);
                                currentCellKey = cellKey;
                            } else if (curElement.equals("value")) {
                                String cellValue = xmlr.getElementText();

                                logger.log(Level.FINE, "\t\twr: {0} = {1}",
                                        new Object[] { currentColumnName, cellValue });

                                // exception cases follow:
                                if (currentColumnName.equals("AuName")) {
                                    logger.log(Level.FINE, "\t\tAuName is replaced with the key[=AuId]= {0}",
                                            currentCellKey);
                                    // This block is for ArchivalUnitStatusTable:
                                    // add the key as a new datum (auId)
                                    // ahead of its value
                                    rowData.add(currentCellKey);
                                    rowDataH.put("AuId", currentCellKey);
                                    currentCellValue = cellValue;
                                } else if (currentColumnName.equals("auId")) {
                                    // This block is for V3PollerTable
                                    logger.log(Level.FINE, "\t\tnew value for auId(V3PollerTable)={0}",
                                            currentCellKey);
                                    // add auName as a new column ahead of auId

                                    rowData.add(cellValue);
                                    rowDataH.put("auName", cellValue);
                                    logger.log(Level.FINE, "\t\tauName(V3PollerTable)={0}", cellValue);

                                    currentCellValue = currentCellKey;
                                } else if (currentColumnName.equals("pollId")) {
                                    // this block is for V3PollerTable
                                    logger.log(Level.FINE, "\t\tFull string (key) is used={0}", currentCellKey);
                                    // The key has the complete string whereas
                                    // the value is its truncated copy
                                    currentCellValue = currentCellKey;

                                } else if (currentColumnName.equals("au")) {
                                    logger.log(Level.FINE,
                                            "\t\tauId is used instead for au(crawl_status_table)={0}",
                                            currentCellKey);

                                    // 2012-02-02: add auName ahead of au
                                    rowData.add(cellValue);
                                    rowDataH.put("auName", cellValue);
                                    logger.log(Level.FINE, "\t\tauName={0}", cellValue);

                                    // This block is for crawl_status_table:
                                    // save the key(auId) instead of value
                                    currentCellValue = currentCellKey;

                                } else if (currentColumnName.equals("Peers")) {

                                    logger.log(Level.FINE, "\t\tURL (key) is used={0}", currentCellKey);
                                    currentCellValue = DaemonStatusDataUtil.escapeHtml(currentCellKey);
                                    logger.log(Level.FINE, "\t\tAfter encoding ={0}", currentCellValue);

                                } else {
                                    if (isCrawlStatusColumn) {
                                        // if the crawl status column is
                                        // "active", some later columns
                                        // may have a reference tag
                                        // so turn on the switch
                                        if (cellValue.equals("Active") || (cellValue.equals("Pending"))) {
                                            isCrawlStatusActive = true;
                                        } else {
                                            isCrawlStatusActive = false;
                                        }
                                    }
                                    // the default processing
                                    currentCellValue = cellValue;
                                }
                                // store currentCellValue
                                logger.log(Level.FINE, "currentCellValue={0}", currentCellValue);
                                rowData.add(currentCellValue);
                                rowDataH.put(currentColumnName, currentCellValue);
                            } // Within ref tag: key and value processing
                        } // value with text or value with ref tag
                    } // columnname or value
                } // within cell
                // ---- summaryinfo tag --------------------------------------------------------
                if (withinSummaryinfo) {
                    logger.log(Level.FINE,
                            "============================ Within SummaryInfo ============================ ");
                    if (curElement.equals("title")) {
                        String text = xmlr.getElementText();
                        si.setTitle(text);

                        logger.log(Level.FINE, "\tsi:titile={0}", si.getTitle());
                    } else if (curElement.equals("type")) {
                        String text = xmlr.getElementText();
                        si.setType(Integer.parseInt(text));
                        logger.log(Level.FINE, "\tsi:type={0}", si.getType());
                    } else if (curElement.equals("key")) {
                        if (withinReference && si.getTitle().equals("Volume")) {
                            // remember the referenced key; it is added as
                            // the "auId" summary item after parsing
                            String text = xmlr.getElementText();
                            logger.log(Level.FINE, "\tsi:key contents(Volume case)={0}", text);
                            siAuId = text;
                            logger.log(Level.FINE, "\tsi:value(Volume case)={0}", siAuId);
                        }
                    } else if (curElement.equals("value")) {
                        if (withinReference) {
                            if (hasRefTitileTagsSI.contains(si.getTitle())) {
                                if (si.getTitle().equals("Volume")) {
                                    // 2012-02-02 use the au name
                                    String text = xmlr.getElementText();
                                    si.setValue(text);
                                    logger.log(Level.FINE, "\tsi:value(Volume case)={0}", si.getValue());
                                } else {
                                    String text = xmlr.getElementText();
                                    si.setValue(text);
                                    logger.log(Level.FINE, "\tsi:value={0}", si.getValue());
                                }
                            }
                        } else {
                            // note: 2012-02-07
                            // daemon 1.59.2 uses the new layout for AU page
                            // this layout includes a summaryinfo tag
                            // that now contains a reference tag
                            String text = null;

                            try {
                                text = xmlr.getElementText();
                                if (!hasRefTitileTagsSI.contains(si.getTitle())) {
                                    si.setValue(text);
                                    logger.log(Level.FINE, "\tsi:value={0}", si.getValue());
                                }
                            } catch (javax.xml.stream.XMLStreamException ex) {
                                logger.log(Level.WARNING, "encounter a reference tag rather than text");
                                continue;
                            }
                        }
                    }
                }

                break;
            case XMLStreamConstants.CHARACTERS:
                break;

            case XMLStreamConstants.ATTRIBUTE:
                break;

            case XMLStreamConstants.END_ELEMENT:
                String endElement = xmlr.getLocalName();
                if (endElement.equals("columndescriptor")) {
                    withinColumndescriptor = false;
                    logger.log(Level.FINE, "leaving columndescriptor");
                } else if (endElement.equals("row")) {
                    if (withinRow) {
                        logger.log(Level.FINE, "========= end of the target row element");
                        withinRow = false;
                    }
                    if (!isCrawlStatusActive) {
                        tabularData.add(rowData);
                        tableData.add(rowDataH);

                    } else {
                        // rows that are being crawled are left out
                        rowIgnored++;
                        rowCounter--;
                    }
                    rowData = null;
                    rowDataH = null;
                    isCrawlStatusActive = false;
                } else if (endElement.equals("cell")) {
                    cellCounter++;
                    withinCell = false;
                    currentColumnName = null;
                    currentCellValue = null;
                    currentCellKey = null;
                    isCrawlStatusColumn = false;
                    valueTagCounter = 0;
                    logger.log(Level.FINE, "leaving cell");
                } else if (endElement.equals("columnname")) {
                    logger.log(Level.FINE, "leaving columnname");
                } else if (endElement.equals("reference")) {
                    withinReference = false;
                } else if (endElement.equals("summaryinfo")) {
                    logger.log(Level.FINE, "si={0}", si.toString());
                    summaryInfoList.add(si);
                    si = null;
                    withinSummaryinfo = false;
                } else if (endElement.equals("value")) {
                    logger.log(Level.FINE, "leaving value");
                } else {
                    // report the element that is actually being closed
                    logger.log(Level.FINE, "--------- end tag = <{0}> ---------", endElement);
                }

                break;
            case XMLStreamConstants.END_DOCUMENT:
                logger.log(Level.FINE, "Total of {0} row occurrences", rowCounter);
                break;
            default:
                break;
            } // end: switch
        } // end:while
    } catch (XMLStreamException ex) {
        logger.log(Level.WARNING, "XMLStreamException occurs", ex);
        this.isTargetPageValid = false;

    } catch (RuntimeException re) {
        logger.log(Level.WARNING, "some RuntimeException occurs", re);
        this.isTargetPageValid = false;
    } catch (Exception e) {
        logger.log(Level.WARNING, "some Exception occurs", e);
        this.isTargetPageValid = false;
    } finally {
        // 5. close reader/IO
        if (xmlr != null) {
            try {
                xmlr.close();
            } catch (XMLStreamException ex) {
                logger.log(Level.WARNING, "XMLStreamException occurs during close()", ex);
            }
        }
        if (!this.isTargetPageValid) {
            logger.log(Level.WARNING,
                    "This parsing session may not be complete due to some exception reported earlier");
        }
    } // end of try

    // currentTableId is still null when parsing stopped before the table
    // name was read (error page, malformed input); compare null-safely so
    // that such a session ends quietly instead of with a NullPointerException
    if ("V3PollerDetailTable".equals(currentTableId)) {
        summaryInfoList.add(new SummaryInfo("auId", 4, siAuId));
        summaryInfoMap = new LinkedHashMap<String, String>();
        for (SummaryInfo si : summaryInfoList) {
            summaryInfoMap.put(si.getTitle(), si.getValue());
        }
    }

    // parsing summary
    logger.log(Level.FINE, "###################### parsing summary ######################");
    logger.log(Level.FINE, "currentTableId={0}", currentTableId);
    logger.log(Level.FINE, "currentTableTitle={0}", currentTableTitle);
    logger.log(Level.FINE, "currentTableKey={0}", currentTableKey);

    logger.log(Level.FINE, "columndescriptorList={0}", columndescriptorList);
    logger.log(Level.FINE, "# of columndescriptors={0}", columndescriptorList.size());
    logger.log(Level.FINE, "typeList={0}", typeList);
    logger.log(Level.FINE, "# of rows counted={0}", rowCounter);
    logger.log(Level.FINE, "# of rows excluded[active ones are excluded]={0}", rowIgnored);
    logger.log(Level.FINE, "summaryInfoList:size={0}", summaryInfoList.size());
    logger.log(Level.FINE, "summaryInfoList={0}", summaryInfoList);
    logger.log(Level.FINE, "table: cell counts = {0}", cellCounter);
    logger.log(Level.FINE, "tableData[map]=\n{0}", tableData);
    logger.log(Level.FINE, "tabularData[list]=\n{0}", tabularData);

    logger.log(Level.FINE, " completed in {0} ms\n\n", (System.currentTimeMillis() - startTime));

    // sanity check: every stored row should have contributed one cell per column
    if (!columndescriptorList.isEmpty()) {
        int noCols = columndescriptorList.size();
        // V3PollerTable declares one column more than its rows carry when
        // no "Errors" cell was seen
        if ("V3PollerTable".equals(currentTableId) && !hasErrorsColumn) {
            noCols--;
        }
        int noCellsExpd = rowCounter * noCols;
        if (noCols > 0) {
            // this table has a table
            logger.log(Level.FINE, "checking parsing results: table dimmensions");
            if (noCellsExpd == cellCounter) {
                logger.log(Level.FINE, "table dimensions and cell-count are consistent");
            } else {
                int diff = noCellsExpd - cellCounter;
                logger.log(Level.FINE, "The table has {0} incomplete cells", diff);
                hasIncompleteRows = true;
                setIncompleteRowList();
                logger.log(Level.FINE, "incomplete rows: {0}", incompleteRows);
            }
        }
    }
}

From source file:com.ikanow.infinit.e.harvest.enrichment.custom.UnstructuredAnalysisHarvester.java

/**
 * processMeta - handle an individual field
 */
private void processMeta(DocumentPojo f, metaField m, String text, SourcePojo source,
        UnstructuredAnalysisConfigPojo uap) {

    boolean bAllowDuplicates = false;
    if ((null != m.flags) && m.flags.contains("U")) {
        bAllowDuplicates = true;
    }
    if ((null == m.scriptlang) || m.scriptlang.equalsIgnoreCase("regex")) {

        Pattern metaPattern = createRegex(m.script, m.flags);

        int timesToRun = 1;
        Object[] currField = null;
        if ((null != m.flags) && m.flags.contains("c")) {
            currField = f.getMetadata().get(m.fieldName);
        }
        if (null != currField) { // chained metadata
            timesToRun = currField.length;
            text = (String) currField[0];
        } //TESTED

        Matcher matcher = metaPattern.matcher(text);
        LinkedList<String> Llist = null;

        for (int ii = 0; ii < timesToRun; ++ii) {
            if (ii > 0) { // (else either just text, or in the above "chained metadata" initialization above)
                text = (String) currField[ii];
                matcher = metaPattern.matcher(text);
            } //TESTED

            StringBuffer prefix = new StringBuffer(m.fieldName).append(':');
            int nFieldNameLen = m.fieldName.length() + 1;

            try {
                while (matcher.find()) {
                    if (null == Llist) {
                        Llist = new LinkedList<String>();
                    }
                    if (null == m.groupNum) {
                        m.groupNum = 0;
                    }
                    String toAdd = matcher.group(m.groupNum);
                    if (null != m.replace) {
                        toAdd = metaPattern.matcher(toAdd).replaceFirst(m.replace);
                    }
                    if ((null != m.flags) && m.flags.contains("H")) {
                        toAdd = StringEscapeUtils.unescapeHtml(toAdd);
                    }
                    prefix.setLength(nFieldNameLen);
                    prefix.append(toAdd);
                    String dupCheck = prefix.toString();

                    if (!regexDuplicates.contains(dupCheck)) {
                        Llist.add(toAdd);
                        if (!bAllowDuplicates) {
                            regexDuplicates.add(dupCheck);
                        }
                    }
                }
            } catch (Exception e) {
                this._context.getHarvestStatus().logMessage("processMeta1: " + e.getMessage(), true);
            }
        } //(end metadata chaining handling)
        if (null != Llist) {
            if (null != currField) { // (overwrite)
                f.getMetadata().put(m.fieldName, Llist.toArray());
            } else {
                f.addToMetadata(m.fieldName, Llist.toArray());
            }
        } //TESTED
    } else if (m.scriptlang.equalsIgnoreCase("javascript")) {
        if (null == f.getMetadata()) {
            f.setMetadata(new LinkedHashMap<String, Object[]>());
        }
        //set the script engine up if necessary
        if ((null != source) && (null != uap)) {
            //(these are null if called from new processing pipeline vs legacy code)
            intializeScriptEngine(source, uap);
        }

        try {
            //TODO (INF-2488): in new format, this should only happen in between contentMeta blocks/docs
            // (also should be able to use SAH _document object I think?)

            // Javascript: the user passes in 
            Object[] currField = f.getMetadata().get(m.fieldName);
            if ((null == m.flags) || m.flags.isEmpty()) {
                if (null == currField) {
                    engine.put("text", text);
                    engine.put("_iterator", null);
                }
                //(otherwise will just pass the current fields in there)
            } else { // flags specified
                if (m.flags.contains("t")) { // text
                    engine.put("text", text);
                }
                if (m.flags.contains("d")) { // entire document (minus ents and assocs)
                    GsonBuilder gb = new GsonBuilder();
                    Gson g = gb.create();
                    List<EntityPojo> ents = f.getEntities();
                    List<AssociationPojo> assocs = f.getAssociations();
                    try {
                        f.setEntities(null);
                        f.setAssociations(null);
                        engine.put("document", g.toJson(f));
                        securityManager.eval(engine, JavaScriptUtils.initScript);
                    } finally {
                        f.setEntities(ents);
                        f.setAssociations(assocs);
                    }
                }
                if (m.flags.contains("m")) { // metadata
                    GsonBuilder gb = new GsonBuilder();
                    Gson g = gb.create();
                    engine.put("_metadata", g.toJson(f.getMetadata()));
                    securityManager.eval(engine, JavaScriptUtils.iteratorMetaScript);
                }
            } //(end flags processing)

            if (null != currField) {
                f.getMetadata().remove(m.fieldName);

                GsonBuilder gb = new GsonBuilder();
                Gson g = gb.create();
                engine.put("_iterator", g.toJson(currField));
                securityManager.eval(engine, JavaScriptUtils.iteratorDocScript);
            }
            //TESTED (handling of flags, and replacing of existing fields, including when field is null but specified)

            Object returnVal = securityManager.eval(engine, m.script);

            if (null != returnVal) {
                if (returnVal instanceof String) { // The only easy case
                    Object[] array = new Object[1];
                    if ((null != m.flags) && m.flags.contains("H")) {
                        returnVal = StringEscapeUtils.unescapeHtml((String) returnVal);
                    }
                    array[0] = returnVal;
                    f.addToMetadata(m.fieldName, array);
                } else { // complex object or array - in either case the engine turns these into
                         // internal.NativeArray or internal.NativeObject

                    BasicDBList outList = JavaScriptUtils.parseNativeJsObject(returnVal, engine);
                    f.addToMetadata(m.fieldName, outList.toArray());
                }
            }
        } catch (ScriptException e) {

            _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(),
                    true);

            // Just do nothing and log
            // e.printStackTrace();
            //DEBUG (don't output log messages per doc)
            //logger.error(e.getMessage());
        } catch (Exception e) {

            _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(),
                    true);

            // Just do nothing and log
            // e.printStackTrace();
            //DEBUG (don't output log messages per doc)
            //logger.error(e.getMessage());
        }
    } else if (m.scriptlang.equalsIgnoreCase("xpath")) {

        String xpath = m.script;

        try {
            createHtmlCleanerIfNeeded();

            int timesToRun = 1;
            Object[] currField = null;
            if ((null != m.flags) && m.flags.contains("c")) {
                currField = f.getMetadata().get(m.fieldName);
            }
            if (null != currField) { // chained metadata
                f.getMetadata().remove(m.fieldName); // (so will add to the end)
                timesToRun = currField.length;
                text = (String) currField[0];
            } //TESTED

            for (int ii = 0; ii < timesToRun; ++ii) {
                if (ii > 0) { // (else either just text, or in the above "chained metadata" initialization above)
                    text = (String) currField[ii];
                } //TESTED

                TagNode node = cleaner.clean(new ByteArrayInputStream(text.getBytes()));

                //NewCode : Only use html cleaner for cleansing
                //use JAXP for full Xpath lib
                Document doc = new DomSerializer(new CleanerProperties()).createDOM(node);

                String extraRegex = extractRegexFromXpath(xpath);

                if (extraRegex != null)
                    xpath = xpath.replace(extraRegex, "");

                XPath xpa = XPathFactory.newInstance().newXPath();
                NodeList res = (NodeList) xpa.evaluate(xpath, doc, XPathConstants.NODESET);

                if (res.getLength() > 0) {
                    if ((null != m.flags) && (m.flags.contains("o"))) { // "o" for object
                        m.groupNum = -1; // (see bConvertToObject below)
                    }
                    StringBuffer prefix = new StringBuffer(m.fieldName).append(':');
                    int nFieldNameLen = m.fieldName.length() + 1;
                    ArrayList<Object> Llist = new ArrayList<Object>(res.getLength());
                    boolean bConvertToObject = ((m.groupNum != null) && (m.groupNum == -1));
                    boolean convertToXml = ((null != m.flags) && (m.flags.contains("x")));
                    for (int i = 0; i < res.getLength(); i++) {
                        Node info_node = res.item(i);
                        if ((null != m.flags) && (m.flags.contains("g"))) {
                            Llist.add(parseHtmlTable(info_node, m.replace));
                        } else if (bConvertToObject || convertToXml) {
                            // Try to create a JSON object out of this
                            StringWriter writer = new StringWriter();
                            try {
                                Transformer transformer = TransformerFactory.newInstance().newTransformer();
                                transformer.transform(new DOMSource(info_node), new StreamResult(writer));
                            } catch (TransformerException e1) {
                                continue;
                            }

                            if (bConvertToObject) {
                                try {
                                    JSONObject subObj = XML.toJSONObject(writer.toString());
                                    if (xpath.endsWith("*")) { // (can have any number of different names here)
                                        Llist.add(XmlToMetadataParser.convertJsonObjectToLinkedHashMap(subObj));
                                    } //TESTED
                                    else {
                                        String[] rootNames = JSONObject.getNames(subObj);
                                        if (1 == rootNames.length) {
                                            // (don't think it can't be any other number in fact)
                                            subObj = subObj.getJSONObject(rootNames[0]);
                                        }
                                        boolean bUnescapeHtml = ((null != m.flags) && m.flags.contains("H"));
                                        Llist.add(XmlToMetadataParser.convertJsonObjectToLinkedHashMap(subObj,
                                                bUnescapeHtml));
                                    } //TESTED
                                } catch (JSONException e) { // Just carry on
                                    continue;
                                }
                                //TESTED
                            } else { // leave in XML form
                                Llist.add(writer.toString().substring(38)); // +38: (step over <?xml version="1.0" encoding="UTF-8"?>)
                            } //TESTED (xpath_test.json)
                        } else { // Treat this as string, either directly or via regex
                            String info = info_node.getTextContent().trim();
                            if (extraRegex == null || extraRegex.isEmpty()) {
                                prefix.setLength(nFieldNameLen);
                                prefix.append(info);
                                String dupCheck = prefix.toString();

                                if (!regexDuplicates.contains(dupCheck)) {
                                    if ((null != m.flags) && m.flags.contains("H")) {
                                        info = StringEscapeUtils.unescapeHtml(info);
                                    }
                                    Llist.add(info);
                                    if (!bAllowDuplicates) {
                                        regexDuplicates.add(dupCheck);
                                    }
                                }
                            } else { // Apply regex to the string
                                Pattern dataRegex = createRegex(extraRegex, m.flags);
                                Matcher dataMatcher = dataRegex.matcher(info);
                                boolean result = dataMatcher.find();
                                while (result) {
                                    String toAdd;
                                    if (m.groupNum != null)
                                        toAdd = dataMatcher.group(m.groupNum);
                                    else
                                        toAdd = dataMatcher.group();
                                    prefix.setLength(nFieldNameLen);
                                    prefix.append(toAdd);
                                    String dupCheck = prefix.toString();

                                    if (!regexDuplicates.contains(dupCheck)) {
                                        if ((null != m.flags) && m.flags.contains("H")) {
                                            toAdd = StringEscapeUtils.unescapeHtml(toAdd);
                                        }
                                        Llist.add(toAdd);
                                        if (!bAllowDuplicates) {
                                            regexDuplicates.add(dupCheck);
                                        }
                                    }

                                    result = dataMatcher.find();
                                }
                            } //(regex vs no regex)
                        } //(end string vs object)
                    }
                    if (Llist.size() > 0) {
                        f.addToMetadata(m.fieldName, Llist.toArray());
                    }
                }
            } //(end loop over metadata objects if applicable)

        } catch (IOException ioe) {
            _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(ioe).toString(),
                    true);

            // Just do nothing and log
            //DEBUG (don't output log messages per doc)
            //logger.error(ioe.getMessage());
        } catch (ParserConfigurationException e1) {
            _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(),
                    true);
            // Just do nothing and log
            //DEBUG (don't output log messages per doc)
            //logger.error(e1.getMessage());
        } catch (XPathExpressionException e1) {
            _context.getHarvestStatus().logMessage("Error evaluating xpath expression: " + xpath, true);
        }
    } else if (m.scriptlang.equalsIgnoreCase("stream")) { // XML or JSON streaming interface
        // which one?
        try {
            boolean json = false;
            boolean xml = false;
            for (int i = 0; i < 128; ++i) {
                if ('<' == text.charAt(i)) {
                    xml = true;
                    break;
                }
                if ('{' == text.charAt(i) || '[' == text.charAt(i)) {
                    json = true;
                    break;
                }
                if (!Character.isSpaceChar(text.charAt(i))) {
                    break;
                }
            } //TESTED (too many spaces: meta_stream_test, test4; incorrect chars: test3, xml: test1, json: test2)

            boolean textNotObject = m.flags == null || !m.flags.contains("o");

            List<DocumentPojo> docs = new LinkedList<DocumentPojo>();
            List<String> levelOneFields = null;
            if (null != m.script) {
                levelOneFields = Arrays.asList(m.script.split("\\s*,\\s*"));
                if ((1 == levelOneFields.size()) && levelOneFields.get(0).isEmpty()) {
                    // convert [""] to null
                    levelOneFields = null;
                }
            } //TESTED (json and xml)

            if (xml) {
                XmlToMetadataParser parser = new XmlToMetadataParser(levelOneFields, null, null, null, null,
                        null, Integer.MAX_VALUE);
                XMLInputFactory factory = XMLInputFactory.newInstance();
                factory.setProperty(XMLInputFactory.IS_COALESCING, true);
                factory.setProperty(XMLInputFactory.SUPPORT_DTD, false);
                XMLStreamReader reader = null;
                try {
                    reader = factory.createXMLStreamReader(new ByteArrayInputStream(text.getBytes()));
                    docs = parser.parseDocument(reader, textNotObject);
                } finally {
                    if (null != reader)
                        reader.close();
                }
            } //TESTED (meta_stream_test, test1)
            if (json) {
                JsonReader jsonReader = null;
                try {
                    JsonToMetadataParser parser = new JsonToMetadataParser(null, levelOneFields, null, null,
                            Integer.MAX_VALUE);
                    jsonReader = new JsonReader(
                            new InputStreamReader(new ByteArrayInputStream(text.getBytes()), "UTF-8"));
                    jsonReader.setLenient(true);
                    docs = parser.parseDocument(jsonReader, textNotObject);
                } finally {
                    if (null != jsonReader)
                        jsonReader.close();
                }
            } //TESTED (meta_stream_test test2)

            if (!docs.isEmpty()) {
                ArrayList<String> Llist = null;
                ArrayList<Object> LlistObj = null;
                if (textNotObject) {
                    Llist = new ArrayList<String>(docs.size());
                } else {
                    LlistObj = new ArrayList<Object>(docs.size());
                }
                for (DocumentPojo doc : docs) {
                    if ((null != doc.getFullText()) || (null != doc.getMetadata())) {
                        if (textNotObject) {
                            Llist.add(doc.getFullText());
                        } //TESTED
                        else if (xml) {
                            LlistObj.add(doc.getMetadata());
                        } //TESTED
                        else if (json) {
                            Object o = doc.getMetadata();
                            if (null != o) {
                                o = doc.getMetadata().get("json");
                                if (o instanceof Object[]) {
                                    LlistObj.addAll(Arrays.asList((Object[]) o));
                                } else if (null != o) {
                                    LlistObj.add(o);
                                } //TESTED
                            }
                        } //TESTED
                    }
                } //TESTED
                if ((null != Llist) && !Llist.isEmpty()) {
                    f.addToMetadata(m.fieldName, Llist.toArray());
                } //TESTED
                if ((null != LlistObj) && !LlistObj.isEmpty()) {
                    f.addToMetadata(m.fieldName, LlistObj.toArray());
                } //TESTED

            } //TESTED (meta_stream_test test1,test2)
        } //(end try)
        catch (Exception e) { // various parsing errors
            _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(),
                    true);
        }
    } //TESTED (meta_stream_test)

    // (don't currently support other script types)
}

From source file:net.xy.jcms.controller.configurations.parser.TranslationParser.java

/**
 * parses an xml configuration from an input streams. throwes
 * IllegalArgumentExceptions in case of syntax error.
 * /*from w  w  w  . j  a  v  a  2s. co m*/
 * @param in
 * @return value
 * @throws XMLStreamException
 * @throws ClassNotFoundException
 *             in case there are problems with an params type converter
 */
public static TranslationRule[] parse(final InputStream in, final ClassLoader loader)
        throws XMLStreamException, ClassNotFoundException {
    @SuppressWarnings("deprecation")
    final XMLInputFactory factory = XMLInputFactory.newInstance(
            "com.sun.xml.internal.stream.XMLInputFactoryImpl", TranslationParser.class.getClassLoader());
    LOG.info("XMLInputFactory loaded: " + factory.getClass().getName());
    factory.setProperty("javax.xml.stream.isCoalescing", true);
    // not supported be the reference implementation
    // factory.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.TRUE);
    final XMLStreamReader parser = factory.createXMLStreamReader(in);
    while (parser.hasNext()) {
        final int event = parser.next();
        if (event == XMLStreamConstants.START_ELEMENT && parser.getName().getLocalPart().equals("rules")) {
            return parseRules(parser, loader);
        }
    }
    throw new IllegalArgumentException("No rules section found.");
}

From source file:net.xy.jcms.controller.configurations.parser.TranslationParser.java

/**
 * Parses a single-file translation: returns the rule built from the first
 * {@code rule} element of the document.
 *
 * @param in
 *            stream supplying the XML; this method does not close it
 * @param loader
 *            class loader passed on to {@code parseRule}
 * @return the rule produced by {@code parseRule}
 * @throws XMLStreamException
 *             if the XML cannot be read or the reader cannot be closed
 * @throws ClassNotFoundException
 *             in case there are problems with a param's type converter
 * @throws IllegalArgumentException
 *             if no {@code rule} start element is found
 */
public static TranslationRule parseSingle(final InputStream in, final ClassLoader loader)
        throws XMLStreamException, ClassNotFoundException {
    // NOTE(review): per the XMLInputFactory API the first argument is a factory id used for
    // lookup, not an implementation class name - confirm this selects the intended factory
    @SuppressWarnings("deprecation")
    final XMLInputFactory factory = XMLInputFactory.newInstance(
            "com.sun.xml.internal.stream.XMLInputFactoryImpl", TranslationParser.class.getClassLoader());
    LOG.info("XMLInputFactory loaded: " + factory.getClass().getName());
    factory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
    final XMLStreamReader parser = factory.createXMLStreamReader(in);
    try {
        while (parser.hasNext()) {
            final int event = parser.next();
            if (event == XMLStreamConstants.START_ELEMENT && parser.getName().getLocalPart().equals("rule")) {
                return parseRule(parser, loader);
            }
        }
        // message names the element actually searched for ("rule", not "rules")
        throw new IllegalArgumentException("No rule section found.");
    } finally {
        // release the reader's resources on every exit path (the underlying stream stays open)
        parser.close();
    }
}

From source file:net.xy.jcms.controller.configurations.parser.UsecaseParser.java

/**
 * parses usecases out from an xml file/*from w  w w  .  j a va 2 s. c  o  m*/
 * 
 * @param in
 * @param loader
 *            used for retrieving configuration included resources and also
 *            for retrieving the controllers
 * @return value
 * @throws XMLStreamException
 * @throws ClassNotFoundException
 */
public static Usecase[] parse(final InputStream in, final ClassLoader loader)
        throws XMLStreamException, ClassNotFoundException {
    final XMLInputFactory factory = XMLInputFactory.newInstance();
    factory.setProperty("javax.xml.stream.isCoalescing", true);
    // not supported by the reference implementation
    // factory.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.TRUE);
    final XMLStreamReader parser = factory.createXMLStreamReader(in);
    while (parser.hasNext()) {
        final int event = parser.next();
        if (event == XMLStreamConstants.START_ELEMENT && parser.getName().getLocalPart().equals("usecases")) {
            return parseUsecases(parser, loader);
        }
    }
    throw new IllegalArgumentException("No usecases section found. [" + parser.getLocation() + "]");
}

From source file:net.xy.jcms.controller.configurations.parser.UsecaseParser.java

/**
 * Parses a single-usecase XML file (one usecase per file): returns the
 * usecase built from the first {@code usecase} element.
 *
 * @param in
 *            stream supplying the XML; this method does not close it
 * @param loader
 *            class loader passed on to {@code parseUsecase}
 * @return parsed usecase
 * @throws XMLStreamException
 *             if the XML cannot be read or the reader cannot be closed
 * @throws ClassNotFoundException
 *             propagated from {@code parseUsecase}
 * @throws IllegalArgumentException
 *             if no {@code usecase} start element is found
 */
public static Usecase parseSingle(final InputStream in, final ClassLoader loader)
        throws XMLStreamException, ClassNotFoundException {
    final XMLInputFactory factory = XMLInputFactory.newInstance();
    factory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
    // not supported by the reference implementation
    // factory.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.TRUE);
    final XMLStreamReader parser = factory.createXMLStreamReader(in);
    try {
        while (parser.hasNext()) {
            final int event = parser.next();
            if (event == XMLStreamConstants.START_ELEMENT && parser.getName().getLocalPart().equals("usecase")) {
                return parseUsecase(parser, loader);
            }
        }
        // message names the element actually searched for ("usecase", not "usecases");
        // the location is read before the reader is closed in the finally block
        throw new IllegalArgumentException("No usecase section found. [" + parser.getLocation() + "]");
    } finally {
        // release the reader's resources on every exit path (the underlying stream stays open)
        parser.close();
    }
}

From source file:org.activiti.bpmn.converter.BpmnXMLConverter.java

/**
 * Reads BPMN 2.0 XML from the given provider and converts it to a {@link BpmnModel},
 * optionally validating it first.
 *
 * The XML factory is configured, where the properties are supported, to not replace
 * entity references, not support external entities and not support DTDs.
 *
 * @param inputStreamProvider supplies the XML; a new stream is requested after validation,
 *            so it must be able to hand out the content more than once when
 *            {@code validateSchema} is true
 * @param validateSchema whether to call {@code validateModel} before converting
 * @param enableSafeBpmnXml when validating: {@code true} validates through the stream reader,
 *            {@code false} validates through the provider
 * @param encoding charset name used to decode the input stream
 * @return the converted model
 * @throws RuntimeException if validation fails, the encoding is unsupported or the XML
 *             cannot be read
 */
public BpmnModel convertToBpmnModel(InputStreamProvider inputStreamProvider, boolean validateSchema,
        boolean enableSafeBpmnXml, String encoding) {
    XMLInputFactory xif = XMLInputFactory.newInstance();

    if (xif.isPropertySupported(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES)) {
        xif.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, false);
    }

    if (xif.isPropertySupported(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES)) {
        xif.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false);
    }

    if (xif.isPropertySupported(XMLInputFactory.SUPPORT_DTD)) {
        xif.setProperty(XMLInputFactory.SUPPORT_DTD, false);
    }

    InputStreamReader in = null;
    try {
        in = new InputStreamReader(inputStreamProvider.getInputStream(), encoding);
        XMLStreamReader xtr = xif.createXMLStreamReader(in);

        try {
            if (validateSchema) {

                if (!enableSafeBpmnXml) {
                    validateModel(inputStreamProvider);
                } else {
                    validateModel(xtr);
                }

                // Release the first reader before it is replaced; otherwise it is never
                // closed (the finally block below only sees the replacement)
                try {
                    in.close();
                } catch (IOException e) {
                    LOGGER.debug("Problem closing BPMN input stream", e);
                }

                // The input stream is closed after schema validation
                in = new InputStreamReader(inputStreamProvider.getInputStream(), encoding);
                xtr = xif.createXMLStreamReader(in);
            }

        } catch (Exception e) {
            throw new RuntimeException("Could not validate XML with BPMN 2.0 XSD", e);
        }

        // XML conversion
        return convertToBpmnModel(xtr);
    } catch (UnsupportedEncodingException e) {
        throw new RuntimeException("The bpmn 2.0 xml is not UTF8 encoded", e);
    } catch (XMLStreamException e) {
        throw new RuntimeException("Error while reading the BPMN 2.0 XML", e);
    } finally {
        if (in != null) {
            try {
                in.close();
            } catch (IOException e) {
                LOGGER.debug("Problem closing BPMN input stream", e);
            }
        }
    }
}