Example usage for com.google.gson.stream JsonReader setLenient

List of usage examples for com.google.gson.stream JsonReader setLenient

Introduction

In this page you can find the example usage for com.google.gson.stream JsonReader setLenient.

Prototype

public final void setLenient(boolean lenient) 

Source Link

Document

Configure this parser to be liberal in what it accepts.

Usage

From source file:com.division.jsonrpc.JsonClient.java

License:Apache License

public void sendRequest(final JsonRequest request, final Type returnType, final RPCCallback callback) {
    Thread t = new Thread() {
        public void run() {
            RPCTransport response;//w  w  w  .jav  a2  s .  c  o m
            String jRequest = gson.toJson(request);
            HttpURLConnection urlConnection = null;
            try {
                URL server = new URL(rpcServer);
                urlConnection = (HttpURLConnection) server.openConnection();
                urlConnection.setRequestMethod("POST");
                urlConnection.setRequestProperty("content-type", "application/json");
                urlConnection.setDoOutput(true);
                OutputStream oStream = urlConnection.getOutputStream();
                System.out.println(jRequest);
                oStream.write(jRequest.getBytes());
                oStream.flush();
                oStream.close();

                InputStream rStream = urlConnection.getInputStream();
                String responseString = "";
                int bytes;
                byte[] bytesRec = new byte[51200];
                while ((bytes = rStream.read(bytesRec, 0, bytesRec.length)) != -1) {
                    responseString += new String(bytesRec, 0, bytes);
                }
                rStream.close();
                responseString = responseString.trim();
                if (callback != null) {
                    JsonReader jReader = new JsonReader(new StringReader(responseString));
                    jReader.setLenient(true);
                    response = gson.fromJson(jReader, returnType);
                    callback.onResponseReceived(response);
                }
                urlConnection.disconnect();
            } catch (Exception e) {
                e.printStackTrace();
                System.out.println("JSONRPC: EXCEPTION");
            } finally {
                if (urlConnection != null) {
                    urlConnection.disconnect();
                }
            }
        }
    };
    t.setDaemon(true);
    t.start();
}

From source file:com.epam.wilma.core.processor.entity.PrettyPrintProcessor.java

License:Open Source License

private String tryToParseJson(final WilmaHttpEntity entity, final String body) {
    String result = body;/*from  ww  w  .  j  a v  a 2  s.  co m*/
    try {
        Gson gson = new GsonBuilder().setPrettyPrinting().serializeNulls().create();
        JsonParser parser = new JsonParser();
        JsonReader reader = new JsonReader(new StringReader(body));
        reader.setLenient(true);
        JsonElement element = parser.parse(reader);
        result = gson.toJson(element);
    } catch (Exception e) {
        logError(entity, e);
    }
    return result;
}

From source file:com.flipkart.android.proteus.demo.converter.GsonResponseBodyConverter.java

License:Apache License

@Override
public T convert(ResponseBody value) throws IOException {
    TypeAdapter<T> adapter = getAdapter();
    JsonReader jsonReader = gson.newJsonReader(value.charStream());
    jsonReader.setLenient(true);
    try {//from  w  w  w.j  a v a 2  s.  com
        return adapter.read(jsonReader);
    } finally {
        value.close();
    }
}

From source file:com.getperka.flatpack.fast.JavaDialect.java

License:Apache License

/**
 * If {@value #concreteTypeMapFile} is defined, load the file into {@link #concreteTypeMap}.
 *//*from www .  j a va2  s  .co  m*/
private void loadConcreteTypeMap() throws IOException {
    if (baseTypeArrayFile != null) {
        JsonReader reader = new JsonReader(new InputStreamReader(new FileInputStream(baseTypeArrayFile), UTF8));
        reader.setLenient(true);
        reader.beginArray();
        while (reader.hasNext()) {
            baseTypes.add(reader.nextString());
        }
        reader.endArray();
        reader.close();
    }
}

From source file:com.getperka.flatpack.Unpacker.java

License:Apache License

private <T> FlatPackEntity<T> unpack(Type returnType, JsonReader reader, Principal principal)
        throws IOException {
    // Hold temporary state for deserialization
    DeserializationContext context = contexts.get();

    /*/* w  ww  .  java 2 s .co  m*/
     * Decoding is done as a two-pass operation since the runtime type of an allocated object cannot
     * be swizzled. The per-entity data is held as a semi-reified JsonObject to be passed off to a
     * Codex.
     */
    Map<HasUuid, JsonObject> entityData = FlatPackCollections.mapForIteration();
    // Used to populate the entityData map
    JsonParser jsonParser = new JsonParser();
    /*
     * The reader is placed in lenient mode as an aid to developers, however all output will be
     * strictly formatted.
     */
    reader.setLenient(true);

    // The return value
    @SuppressWarnings("unchecked")
    FlatPackEntity<T> toReturn = (FlatPackEntity<T>) FlatPackEntity.create(returnType, null, principal);
    // Stores the single, top-level value in the payload for two-pass reification
    JsonElement value = null;

    if (reader.peek().equals(JsonToken.NULL)) {
        return toReturn;
    }

    reader.beginObject();

    while (JsonToken.NAME.equals(reader.peek())) {
        String name = reader.nextName();
        if ("data".equals(name)) {
            // data : { "fooEntity" : [ { ... }, { ... } ]
            reader.beginObject();
            while (JsonToken.NAME.equals(reader.peek())) {
                // Turn "fooEntity" into the actual FooEntity class
                String simpleName = reader.nextName();
                Class<? extends HasUuid> clazz = typeContext.getClass(simpleName);
                if (clazz == null) {
                    if (ignoreUnresolvableTypes) {
                        reader.skipValue();
                        continue;
                    } else {
                        throw new UnsupportedOperationException("Cannot resolve type " + simpleName);
                    }
                } else if (Modifier.isAbstract(clazz.getModifiers())) {
                    throw new UnsupportedOperationException(
                            "A subclass of " + simpleName + " must be used instead");
                }
                context.pushPath("allocating " + simpleName);
                try {
                    // Find the Codex for the requested entity type
                    EntityCodex<?> codex = (EntityCodex<?>) typeContext.getCodex(clazz);

                    // Take the n-many property objects and stash them for later decoding
                    reader.beginArray();
                    while (!JsonToken.END_ARRAY.equals(reader.peek())) {
                        JsonObject chunk = jsonParser.parse(reader).getAsJsonObject();
                        HasUuid entity = codex.allocate(chunk, context);
                        toReturn.addExtraEntity(entity);
                        entityData.put(entity, chunk.getAsJsonObject());
                    }
                    reader.endArray();
                } finally {
                    context.popPath();
                }
            }
            reader.endObject();
        } else if ("errors".equals(name)) {
            // "errors" : { "path" : "problem", "path2" : "problem2" }
            reader.beginObject();
            while (JsonToken.NAME.equals(reader.peek())) {
                String path = reader.nextName();
                if (JsonToken.STRING.equals(reader.peek()) || JsonToken.NUMBER.equals(reader.peek())) {
                    toReturn.addError(path, reader.nextString());
                } else {
                    reader.skipValue();
                }
            }
            reader.endObject();
        } else if ("metadata".equals(name)) {
            reader.beginArray();

            while (!JsonToken.END_ARRAY.equals(reader.peek())) {
                EntityMetadata meta = new EntityMetadata();
                JsonObject metaElement = jsonParser.parse(reader).getAsJsonObject();
                metaCodex.get().readProperties(meta, metaElement, context);
                toReturn.addMetadata(meta);
            }

            reader.endArray();
        } else if ("value".equals(name)) {
            // Just stash the value element in case it occurs first
            value = jsonParser.parse(reader);
        } else if (JsonToken.STRING.equals(reader.peek()) || JsonToken.NUMBER.equals(reader.peek())) {
            // Save off any other simple properties
            toReturn.putExtraData(name, reader.nextString());
        } else {
            // Ignore random other entries
            reader.skipValue();
        }
    }

    reader.endObject();
    reader.close();

    for (Map.Entry<HasUuid, JsonObject> entry : entityData.entrySet()) {
        HasUuid entity = entry.getKey();
        EntityCodex<HasUuid> codex = (EntityCodex<HasUuid>) typeContext.getCodex(entity.getClass());
        codex.readProperties(entity, entry.getValue(), context);
    }

    @SuppressWarnings("unchecked")
    Codex<T> returnCodex = (Codex<T>) typeContext.getCodex(toReturn.getType());
    toReturn.withValue(returnCodex.read(value, context));

    for (Map.Entry<UUID, String> entry : context.getWarnings().entrySet()) {
        toReturn.addWarning(entry.getKey().toString(), entry.getValue());
    }

    // Process metadata
    for (EntityMetadata meta : toReturn.getMetadata()) {
        if (meta.isPersistent()) {
            HasUuid entity = context.getEntity(meta.getUuid());
            if (entity instanceof PersistenceAware) {
                ((PersistenceAware) entity).markPersistent();
            }
        }
    }

    context.runPostWork();
    context.close();

    return toReturn;
}

From source file:com.google.api.client.json.gson.GsonParser.java

License:Apache License

GsonParser(GsonFactory factory, JsonReader reader) {
    this.factory = factory;
    this.reader = reader;
    // lenient to allow top-level values of any type
    reader.setLenient(true);
}

From source file:com.google.devtools.moe.client.project.ProjectConfig.java

License:Apache License

public static ProjectConfig parse(String configText) throws InvalidProject {
    ProjectConfig config = null;/* ww  w . ja  v  a 2  s  .  co m*/
    if (configText != null) {
        try {
            JsonReader configReader = new JsonReader(new StringReader(configText));
            configReader.setLenient(true);
            JsonElement configJsonElement = new JsonParser().parse(configReader);
            if (configJsonElement != null) {
                // Config files often contain JavaScript idioms like comments, single quoted strings,
                // and trailing commas in lists.
                // Check that the JSON parsed from configText is structurally the same as that
                // produced when it is interpreted by GSON in lax mode.
                String normalConfigText = JsonSanitizer.sanitize(configText);
                JsonElement normalConfigJsonElement = new JsonParser().parse(normalConfigText);
                JsonStructureChecker.requireSimilar(configJsonElement, normalConfigJsonElement);

                Gson gson = GsonModule.provideGson(); // TODO(user): Remove this static reference.
                config = gson.fromJson(configText, ProjectConfig.class);
            }
        } catch (JsonParseException e) {
            throw new InvalidProject(e, "Could not parse MOE config: " + e.getMessage());
        }
    }

    if (config == null) {
        throw new InvalidProject("Could not parse MOE config");
    }
    config.validate();
    return config;
}

From source file:com.google.samples.apps.iosched.sync.ConferenceDataHandler.java

License:Open Source License

/**
 * Processes a conference data body and calls the appropriate data type handlers
 * to process each of the objects represented therein.
 *
 * @param dataBody The body of data to process
 * @throws IOException If there is an error parsing the data.
 */// ww w . ja  va 2s. com
private void processDataBody(String dataBody) throws IOException {
    JsonReader reader = new JsonReader(new StringReader(dataBody));
    JsonParser parser = new JsonParser();
    try {
        reader.setLenient(true); // To err is human

        // the whole file is a single JSON object
        reader.beginObject();

        while (reader.hasNext()) {
            // the key is "rooms", "speakers", "tracks", etc.
            String key = reader.nextName();
            if (mHandlerForKey.containsKey(key)) {
                // pass the value to the corresponding handler
                mHandlerForKey.get(key).process(parser.parse(reader));
            } else {
                LOGW(TAG, "Skipping unknown key in conference data json: " + key);
                reader.skipValue();
            }
        }
        reader.endObject();
    } finally {
        reader.close();
    }
}

From source file:com.ikanow.infinit.e.harvest.enrichment.custom.UnstructuredAnalysisHarvester.java

License:Open Source License

/**
 * processMeta - handle an individual field
 *///from  w w  w.  j a v a 2  s .c  o m
private void processMeta(DocumentPojo f, metaField m, String text, SourcePojo source,
        UnstructuredAnalysisConfigPojo uap) {

    boolean bAllowDuplicates = false;
    if ((null != m.flags) && m.flags.contains("U")) {
        bAllowDuplicates = true;
    }
    if ((null == m.scriptlang) || m.scriptlang.equalsIgnoreCase("regex")) {

        Pattern metaPattern = createRegex(m.script, m.flags);

        int timesToRun = 1;
        Object[] currField = null;
        if ((null != m.flags) && m.flags.contains("c")) {
            currField = f.getMetadata().get(m.fieldName);
        }
        if (null != currField) { // chained metadata
            timesToRun = currField.length;
            text = (String) currField[0];
        } //TESTED

        Matcher matcher = metaPattern.matcher(text);
        LinkedList<String> Llist = null;

        for (int ii = 0; ii < timesToRun; ++ii) {
            if (ii > 0) { // (else either just text, or in the above "chained metadata" initialization above)
                text = (String) currField[ii];
                matcher = metaPattern.matcher(text);
            } //TESTED

            StringBuffer prefix = new StringBuffer(m.fieldName).append(':');
            int nFieldNameLen = m.fieldName.length() + 1;

            try {
                while (matcher.find()) {
                    if (null == Llist) {
                        Llist = new LinkedList<String>();
                    }
                    if (null == m.groupNum) {
                        m.groupNum = 0;
                    }
                    String toAdd = matcher.group(m.groupNum);
                    if (null != m.replace) {
                        toAdd = metaPattern.matcher(toAdd).replaceFirst(m.replace);
                    }
                    if ((null != m.flags) && m.flags.contains("H")) {
                        toAdd = StringEscapeUtils.unescapeHtml(toAdd);
                    }
                    prefix.setLength(nFieldNameLen);
                    prefix.append(toAdd);
                    String dupCheck = prefix.toString();

                    if (!regexDuplicates.contains(dupCheck)) {
                        Llist.add(toAdd);
                        if (!bAllowDuplicates) {
                            regexDuplicates.add(dupCheck);
                        }
                    }
                }
            } catch (Exception e) {
                this._context.getHarvestStatus().logMessage("processMeta1: " + e.getMessage(), true);
            }
        } //(end metadata chaining handling)
        if (null != Llist) {
            if (null != currField) { // (overwrite)
                f.getMetadata().put(m.fieldName, Llist.toArray());
            } else {
                f.addToMetadata(m.fieldName, Llist.toArray());
            }
        } //TESTED
    } else if (m.scriptlang.equalsIgnoreCase("javascript")) {
        if (null == f.getMetadata()) {
            f.setMetadata(new LinkedHashMap<String, Object[]>());
        }
        //set the script engine up if necessary
        if ((null != source) && (null != uap)) {
            //(these are null if called from new processing pipeline vs legacy code)
            intializeScriptEngine(source, uap);
        }

        try {
            //TODO (INF-2488): in new format, this should only happen in between contentMeta blocks/docs
            // (also should be able to use SAH _document object I think?)

            // Javascript: the user passes in 
            Object[] currField = f.getMetadata().get(m.fieldName);
            if ((null == m.flags) || m.flags.isEmpty()) {
                if (null == currField) {
                    engine.put("text", text);
                    engine.put("_iterator", null);
                }
                //(otherwise will just pass the current fields in there)
            } else { // flags specified
                if (m.flags.contains("t")) { // text
                    engine.put("text", text);
                }
                if (m.flags.contains("d")) { // entire document (minus ents and assocs)
                    GsonBuilder gb = new GsonBuilder();
                    Gson g = gb.create();
                    List<EntityPojo> ents = f.getEntities();
                    List<AssociationPojo> assocs = f.getAssociations();
                    try {
                        f.setEntities(null);
                        f.setAssociations(null);
                        engine.put("document", g.toJson(f));
                        securityManager.eval(engine, JavaScriptUtils.initScript);
                    } finally {
                        f.setEntities(ents);
                        f.setAssociations(assocs);
                    }
                }
                if (m.flags.contains("m")) { // metadata
                    GsonBuilder gb = new GsonBuilder();
                    Gson g = gb.create();
                    engine.put("_metadata", g.toJson(f.getMetadata()));
                    securityManager.eval(engine, JavaScriptUtils.iteratorMetaScript);
                }
            } //(end flags processing)

            if (null != currField) {
                f.getMetadata().remove(m.fieldName);

                GsonBuilder gb = new GsonBuilder();
                Gson g = gb.create();
                engine.put("_iterator", g.toJson(currField));
                securityManager.eval(engine, JavaScriptUtils.iteratorDocScript);
            }
            //TESTED (handling of flags, and replacing of existing fields, including when field is null but specified)

            Object returnVal = securityManager.eval(engine, m.script);

            if (null != returnVal) {
                if (returnVal instanceof String) { // The only easy case
                    Object[] array = new Object[1];
                    if ((null != m.flags) && m.flags.contains("H")) {
                        returnVal = StringEscapeUtils.unescapeHtml((String) returnVal);
                    }
                    array[0] = returnVal;
                    f.addToMetadata(m.fieldName, array);
                } else { // complex object or array - in either case the engine turns these into
                         // internal.NativeArray or internal.NativeObject

                    BasicDBList outList = JavaScriptUtils.parseNativeJsObject(returnVal, engine);
                    f.addToMetadata(m.fieldName, outList.toArray());
                }
            }
        } catch (ScriptException e) {

            _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(),
                    true);

            // Just do nothing and log
            // e.printStackTrace();
            //DEBUG (don't output log messages per doc)
            //logger.error(e.getMessage());
        } catch (Exception e) {

            _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(),
                    true);

            // Just do nothing and log
            // e.printStackTrace();
            //DEBUG (don't output log messages per doc)
            //logger.error(e.getMessage());
        }
    } else if (m.scriptlang.equalsIgnoreCase("xpath")) {

        String xpath = m.script;

        try {
            createHtmlCleanerIfNeeded();

            int timesToRun = 1;
            Object[] currField = null;
            if ((null != m.flags) && m.flags.contains("c")) {
                currField = f.getMetadata().get(m.fieldName);
            }
            if (null != currField) { // chained metadata
                f.getMetadata().remove(m.fieldName); // (so will add to the end)
                timesToRun = currField.length;
                text = (String) currField[0];
            } //TESTED

            for (int ii = 0; ii < timesToRun; ++ii) {
                if (ii > 0) { // (else either just text, or in the above "chained metadata" initialization above)
                    text = (String) currField[ii];
                } //TESTED

                TagNode node = cleaner.clean(new ByteArrayInputStream(text.getBytes()));

                //NewCode : Only use html cleaner for cleansing
                //use JAXP for full Xpath lib
                Document doc = new DomSerializer(new CleanerProperties()).createDOM(node);

                String extraRegex = extractRegexFromXpath(xpath);

                if (extraRegex != null)
                    xpath = xpath.replace(extraRegex, "");

                XPath xpa = XPathFactory.newInstance().newXPath();
                NodeList res = (NodeList) xpa.evaluate(xpath, doc, XPathConstants.NODESET);

                if (res.getLength() > 0) {
                    if ((null != m.flags) && (m.flags.contains("o"))) { // "o" for object
                        m.groupNum = -1; // (see bConvertToObject below)
                    }
                    StringBuffer prefix = new StringBuffer(m.fieldName).append(':');
                    int nFieldNameLen = m.fieldName.length() + 1;
                    ArrayList<Object> Llist = new ArrayList<Object>(res.getLength());
                    boolean bConvertToObject = ((m.groupNum != null) && (m.groupNum == -1));
                    boolean convertToXml = ((null != m.flags) && (m.flags.contains("x")));
                    for (int i = 0; i < res.getLength(); i++) {
                        Node info_node = res.item(i);
                        if ((null != m.flags) && (m.flags.contains("g"))) {
                            Llist.add(parseHtmlTable(info_node, m.replace));
                        } else if (bConvertToObject || convertToXml) {
                            // Try to create a JSON object out of this
                            StringWriter writer = new StringWriter();
                            try {
                                Transformer transformer = TransformerFactory.newInstance().newTransformer();
                                transformer.transform(new DOMSource(info_node), new StreamResult(writer));
                            } catch (TransformerException e1) {
                                continue;
                            }

                            if (bConvertToObject) {
                                try {
                                    JSONObject subObj = XML.toJSONObject(writer.toString());
                                    if (xpath.endsWith("*")) { // (can have any number of different names here)
                                        Llist.add(XmlToMetadataParser.convertJsonObjectToLinkedHashMap(subObj));
                                    } //TESTED
                                    else {
                                        String[] rootNames = JSONObject.getNames(subObj);
                                        if (1 == rootNames.length) {
                                            // (don't think it can't be any other number in fact)
                                            subObj = subObj.getJSONObject(rootNames[0]);
                                        }
                                        boolean bUnescapeHtml = ((null != m.flags) && m.flags.contains("H"));
                                        Llist.add(XmlToMetadataParser.convertJsonObjectToLinkedHashMap(subObj,
                                                bUnescapeHtml));
                                    } //TESTED
                                } catch (JSONException e) { // Just carry on
                                    continue;
                                }
                                //TESTED
                            } else { // leave in XML form
                                Llist.add(writer.toString().substring(38)); // +38: (step over <?xml version="1.0" encoding="UTF-8"?>)
                            } //TESTED (xpath_test.json)
                        } else { // Treat this as string, either directly or via regex
                            String info = info_node.getTextContent().trim();
                            if (extraRegex == null || extraRegex.isEmpty()) {
                                prefix.setLength(nFieldNameLen);
                                prefix.append(info);
                                String dupCheck = prefix.toString();

                                if (!regexDuplicates.contains(dupCheck)) {
                                    if ((null != m.flags) && m.flags.contains("H")) {
                                        info = StringEscapeUtils.unescapeHtml(info);
                                    }
                                    Llist.add(info);
                                    if (!bAllowDuplicates) {
                                        regexDuplicates.add(dupCheck);
                                    }
                                }
                            } else { // Apply regex to the string
                                Pattern dataRegex = createRegex(extraRegex, m.flags);
                                Matcher dataMatcher = dataRegex.matcher(info);
                                boolean result = dataMatcher.find();
                                while (result) {
                                    String toAdd;
                                    if (m.groupNum != null)
                                        toAdd = dataMatcher.group(m.groupNum);
                                    else
                                        toAdd = dataMatcher.group();
                                    prefix.setLength(nFieldNameLen);
                                    prefix.append(toAdd);
                                    String dupCheck = prefix.toString();

                                    if (!regexDuplicates.contains(dupCheck)) {
                                        if ((null != m.flags) && m.flags.contains("H")) {
                                            toAdd = StringEscapeUtils.unescapeHtml(toAdd);
                                        }
                                        Llist.add(toAdd);
                                        if (!bAllowDuplicates) {
                                            regexDuplicates.add(dupCheck);
                                        }
                                    }

                                    result = dataMatcher.find();
                                }
                            } //(regex vs no regex)
                        } //(end string vs object)
                    }
                    if (Llist.size() > 0) {
                        f.addToMetadata(m.fieldName, Llist.toArray());
                    }
                }
            } //(end loop over metadata objects if applicable)

        } catch (IOException ioe) {
            _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(ioe).toString(),
                    true);

            // Just do nothing and log
            //DEBUG (don't output log messages per doc)
            //logger.error(ioe.getMessage());
        } catch (ParserConfigurationException e1) {
            _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(),
                    true);
            // Just do nothing and log
            //DEBUG (don't output log messages per doc)
            //logger.error(e1.getMessage());
        } catch (XPathExpressionException e1) {
            _context.getHarvestStatus().logMessage("Error evaluating xpath expression: " + xpath, true);
        }
    } else if (m.scriptlang.equalsIgnoreCase("stream")) { // XML or JSON streaming interface
        // which one?
        try {
            boolean json = false;
            boolean xml = false;
            for (int i = 0; i < 128; ++i) {
                if ('<' == text.charAt(i)) {
                    xml = true;
                    break;
                }
                if ('{' == text.charAt(i) || '[' == text.charAt(i)) {
                    json = true;
                    break;
                }
                if (!Character.isSpaceChar(text.charAt(i))) {
                    break;
                }
            } //TESTED (too many spaces: meta_stream_test, test4; incorrect chars: test3, xml: test1, json: test2)

            boolean textNotObject = m.flags == null || !m.flags.contains("o");

            List<DocumentPojo> docs = new LinkedList<DocumentPojo>();
            List<String> levelOneFields = null;
            if (null != m.script) {
                levelOneFields = Arrays.asList(m.script.split("\\s*,\\s*"));
                if ((1 == levelOneFields.size()) && levelOneFields.get(0).isEmpty()) {
                    // convert [""] to null
                    levelOneFields = null;
                }
            } //TESTED (json and xml)

            if (xml) {
                XmlToMetadataParser parser = new XmlToMetadataParser(levelOneFields, null, null, null, null,
                        null, Integer.MAX_VALUE);
                XMLInputFactory factory = XMLInputFactory.newInstance();
                factory.setProperty(XMLInputFactory.IS_COALESCING, true);
                factory.setProperty(XMLInputFactory.SUPPORT_DTD, false);
                XMLStreamReader reader = null;
                try {
                    reader = factory.createXMLStreamReader(new ByteArrayInputStream(text.getBytes()));
                    docs = parser.parseDocument(reader, textNotObject);
                } finally {
                    if (null != reader)
                        reader.close();
                }
            } //TESTED (meta_stream_test, test1)
            if (json) {
                JsonReader jsonReader = null;
                try {
                    JsonToMetadataParser parser = new JsonToMetadataParser(null, levelOneFields, null, null,
                            Integer.MAX_VALUE);
                    jsonReader = new JsonReader(
                            new InputStreamReader(new ByteArrayInputStream(text.getBytes()), "UTF-8"));
                    jsonReader.setLenient(true);
                    docs = parser.parseDocument(jsonReader, textNotObject);
                } finally {
                    if (null != jsonReader)
                        jsonReader.close();
                }
            } //TESTED (meta_stream_test test2)

            if (!docs.isEmpty()) {
                ArrayList<String> Llist = null;
                ArrayList<Object> LlistObj = null;
                if (textNotObject) {
                    Llist = new ArrayList<String>(docs.size());
                } else {
                    LlistObj = new ArrayList<Object>(docs.size());
                }
                for (DocumentPojo doc : docs) {
                    if ((null != doc.getFullText()) || (null != doc.getMetadata())) {
                        if (textNotObject) {
                            Llist.add(doc.getFullText());
                        } //TESTED
                        else if (xml) {
                            LlistObj.add(doc.getMetadata());
                        } //TESTED
                        else if (json) {
                            Object o = doc.getMetadata();
                            if (null != o) {
                                o = doc.getMetadata().get("json");
                                if (o instanceof Object[]) {
                                    LlistObj.addAll(Arrays.asList((Object[]) o));
                                } else if (null != o) {
                                    LlistObj.add(o);
                                } //TESTED
                            }
                        } //TESTED
                    }
                } //TESTED
                if ((null != Llist) && !Llist.isEmpty()) {
                    f.addToMetadata(m.fieldName, Llist.toArray());
                } //TESTED
                if ((null != LlistObj) && !LlistObj.isEmpty()) {
                    f.addToMetadata(m.fieldName, LlistObj.toArray());
                } //TESTED

            } //TESTED (meta_stream_test test1,test2)
        } //(end try)
        catch (Exception e) { // various parsing errors
            _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(),
                    true);
        }
    } //TESTED (meta_stream_test)

    // (don't currently support other script types)
}

From source file:com.ikanow.infinit.e.harvest.extraction.document.file.FileHarvester.java

License:Open Source License

private void parse(InfiniteFile f, SourcePojo source) throws MalformedURLException, URISyntaxException {

    //NOTE: we only ever break out of here because of max docs in standalone mode
    // (because we don't know how to continue reading)

    DocumentPojo doc = null;/*from ww  w  .  jav a2 s  . c  o m*/
    //Determine File Extension
    String fileName = f.getName().toString();

    int mid = fileName.lastIndexOf(".");
    String extension = fileName.substring(mid + 1, fileName.length());

    //Checked to save processing time
    long fileTimestamp = (f.getDate() / 1000) * 1000;
    // (ensure truncated to seconds, since some operation somewhere hear does this...)

    Date modDate = new Date(fileTimestamp);
    //XML Data gets placed into MetaData

    boolean bIsXml = false;
    boolean bIsJson = false;
    boolean bIsLineOriented = false;
    if ((null != source.getFileConfig()) && (null != source.getFileConfig().type)) {
        extension = source.getFileConfig().type;
    }
    bIsXml = extension.equalsIgnoreCase("xml");
    bIsJson = extension.equalsIgnoreCase("json");
    bIsLineOriented = extension.endsWith("sv");

    if (bIsXml || bIsJson || bIsLineOriented) {
        int debugMaxDocs = Integer.MAX_VALUE; // by default don't set this, it's only for debug mode
        if (_context.isStandalone()) { // debug mode
            debugMaxDocs = maxDocsPerCycle;
        }
        //fast check to see if the file has changed before processing (or if it never existed)
        if (needsUpdated_SourceUrl(modDate, f.getUrlString(), source)) {
            if (0 != modDate.getTime()) { // if it ==0 then sourceUrl doesn't exist at all, no need to delete
                // This file already exists - in normal/managed mode will re-create
                // In streaming mode, simple skip over
                if (_streaming) {
                    return;
                } //TESTED

                DocumentPojo docRepresentingSrcUrl = new DocumentPojo();
                docRepresentingSrcUrl.setSourceUrl(f.getUrlString());
                docRepresentingSrcUrl.setSourceKey(source.getKey());
                docRepresentingSrcUrl.setCommunityId(source.getCommunityIds().iterator().next());
                sourceUrlsGettingUpdated.add(docRepresentingSrcUrl.getSourceUrl());
                this.docsToRemove.add(docRepresentingSrcUrl);
                // (can add documents with just source URL, are treated differently in the core libraries)               
            }

            SourceFileConfigPojo fileSystem = source.getFileConfig();
            if ((null == fileSystem) && (bIsXml || bIsJson)) {
                fileSystem = new SourceFileConfigPojo();
            }
            XmlToMetadataParser xmlParser = null;
            JsonToMetadataParser jsonParser = null;
            String urlType = extension;
            if (bIsXml) {
                xmlParser = new XmlToMetadataParser(fileSystem.XmlRootLevelValues, fileSystem.XmlIgnoreValues,
                        fileSystem.XmlSourceName, fileSystem.XmlPrimaryKey, fileSystem.XmlAttributePrefix,
                        fileSystem.XmlPreserveCase, debugMaxDocs);
            } //TESTED
            else if (bIsJson) {
                jsonParser = new JsonToMetadataParser(fileSystem.XmlSourceName, fileSystem.XmlRootLevelValues,
                        fileSystem.XmlPrimaryKey, fileSystem.XmlIgnoreValues, debugMaxDocs);
            } //TESTED

            List<DocumentPojo> partials = null;
            try {
                if (bIsXml) {
                    XMLStreamReader xmlStreamReader = null;
                    XMLInputFactory factory = XMLInputFactory.newInstance();
                    factory.setProperty(XMLInputFactory.IS_COALESCING, true);
                    factory.setProperty(XMLInputFactory.SUPPORT_DTD, false);
                    try {
                        xmlStreamReader = factory.createXMLStreamReader(f.getInputStream());
                        partials = xmlParser.parseDocument(xmlStreamReader);
                        long memUsage = xmlParser.getMemUsage();
                        _memUsage += memUsage;
                        _totalMemUsage.addAndGet(memUsage);
                    } finally {
                        if (null != xmlStreamReader)
                            xmlStreamReader.close();
                    }
                } //TESTED
                else if (bIsJson) {
                    JsonReader jsonReader = null;
                    try {
                        jsonReader = new JsonReader(new InputStreamReader(f.getInputStream(), "UTF-8"));
                        jsonReader.setLenient(true);
                        partials = jsonParser.parseDocument(jsonReader);
                        long memUsage = jsonParser.getMemUsage();
                        _memUsage += memUsage;
                        _totalMemUsage.addAndGet(memUsage);
                    } finally {
                        if (null != jsonReader)
                            jsonReader.close();
                    }
                } //TESTED
                else if (bIsLineOriented) { // Just generate a document for every line

                    BufferedReader lineReader = null;
                    try {
                        lineReader = new BufferedReader(new InputStreamReader(f.getInputStream(), "UTF-8"));
                        CsvToMetadataParser lineParser = new CsvToMetadataParser(debugMaxDocs);
                        partials = lineParser.parseDocument(lineReader, source);
                        long memUsage = lineParser.getMemUsage();
                        _memUsage += memUsage;
                        _totalMemUsage.addAndGet(memUsage);
                    } finally {
                        if (null != lineReader)
                            lineReader.close();
                    }
                } //TESTED

                MessageDigest md5 = null; // (generates unique urls if the user doesn't below)
                try {
                    md5 = MessageDigest.getInstance("MD5");
                } catch (NoSuchAlgorithmException e) {
                    // Do nothing, unlikely to happen...
                }
                int nIndex = 0;
                int numPartials = partials.size();
                for (DocumentPojo doctoAdd : partials) {
                    nIndex++;
                    doctoAdd.setSource(source.getTitle());
                    doctoAdd.setSourceKey(source.getKey());
                    doctoAdd.setMediaType(source.getMediaType());
                    doctoAdd.setModified(new Date(fileTimestamp));
                    doctoAdd.setCreated(new Date());

                    if (null == doctoAdd.getUrl()) { // Can be set in the parser or here
                        doctoAdd.setHasDefaultUrl(true); // (ie cannot occur in a different src URL)

                        if (1 == numPartials) {
                            String urlString = f.getUrlString();
                            if (urlString.endsWith(urlType)) {
                                doctoAdd.setUrl(urlString);
                            } else {
                                doctoAdd.setUrl(
                                        new StringBuffer(urlString).append('.').append(urlType).toString());
                            }
                            // (we always set sourceUrl as the true url of the file, so want to differentiate the URL with
                            //  some useful information)
                        } else if (null == doctoAdd.getMetadata()) { // Line oriented case
                            doctoAdd.setUrl(new StringBuffer(f.getUrlString()).append("/").append(nIndex)
                                    .append('.').append(urlType).toString());
                        } else {
                            if (null == md5) { // Will never happen, MD5 always exists
                                doctoAdd.setUrl(new StringBuffer(f.getUrlString()).append("/")
                                        .append(doctoAdd.getMetadata().hashCode()).append('.').append(urlType)
                                        .toString());
                            } else { // This is the standard call if the XML parser has not been configured to build the URL
                                doctoAdd.setUrl(new StringBuffer(f.getUrlString()).append("/")
                                        .append(DigestUtils.md5Hex(doctoAdd.getMetadata().toString()))
                                        .append('.').append(urlType).toString());
                            }
                        } //TESTED
                    }
                    doctoAdd.setTitle(f.getName().toString());
                    doctoAdd.setPublishedDate(new Date(fileTimestamp));
                    doctoAdd.setSourceUrl(f.getUrlString());

                    // Always add to files because I'm deleting the source URL
                    files.add(doctoAdd);
                } //TESTED 

            } catch (XMLStreamException e1) {
                errors++;
                _context.getHarvestStatus()
                        .logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(), true);
            } catch (FactoryConfigurationError e1) {
                errors++;
                _context.getHarvestStatus().logMessage(e1.getMessage(), true);

            } catch (IOException e1) {
                errors++;
                _context.getHarvestStatus()
                        .logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(), true);
            } catch (Exception e1) {
                errors++;
                _context.getHarvestStatus()
                        .logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(), true);
            }
        } //(end if needs updated)
    } else //Tika supports Excel,Word,Powerpoint,Visio, & Outlook Documents
    {
        // (This dedup tells me if it's an add/update vs ignore - qr.isDuplicate higher up tells me if I need to add or update)
        if (needsUpdated_Url(modDate, f.getUrlString(), source)) {

            Metadata metadata = null;
            InputStream in = null;
            try {

                doc = new DocumentPojo();

                // Create a tika object (first time only)
                if (null == _tika) {
                    this.initializeTika(_context, source);
                }

                // BUGGERY
                // NEED TO LIKELY SET LIMIT TO BE 30MB or 50MB and BYPASS ANYTHING OVER THAT BELOW IS THE CODE TO DO THAT
                // tika.setMaxStringLength(30*1024*1024);
                // Disable the string length limit
                _tika.setMaxStringLength(-1);
                //input = new FileInputStream(new File(resourceLocation));
                // Create a metadata object to contain the metadata

                metadata = new Metadata();
                // Parse the file and get the text of the file
                doc.setSource(source.getTitle());
                doc.setSourceKey(source.getKey());
                doc.setMediaType(source.getMediaType());
                String fullText = "";

                in = f.getInputStream();
                try {
                    if (null == _tikaOutputFormat) { // text only
                        fullText = _tika.parseToString(in, metadata);
                    } //TESTED
                    else { // XML/HMTL
                        _tika.getParser().parse(in, _tikaOutputFormat, metadata, _tikaOutputParseContext);
                        fullText = _tikaXmlFormatWriter.toString();
                        _tikaXmlFormatWriter.getBuffer().setLength(0);
                    } //TESTED
                } finally {
                    if (null != in)
                        in.close();
                }
                int descCap = 500;
                doc.setFullText(fullText);
                if (descCap > fullText.length()) {
                    descCap = fullText.length();
                }
                doc.setDescription(fullText.substring(0, descCap));
                doc.setModified(new Date(fileTimestamp));
                doc.setCreated(new Date());
                doc.setUrl(f.getUrlString());
                doc.setTitle(f.getName().toString());
                doc.setPublishedDate(new Date(fileTimestamp));

                long memUsage = (250L * (doc.getFullText().length() + doc.getDescription().length())) / 100L; // 25% overhead, 2x for string->byte
                _memUsage += memUsage;
                _totalMemUsage.addAndGet(memUsage);

                // If the metadata contains a more plausible date then use that
                try {
                    String title = metadata.get(Metadata.TITLE);
                    if (null != title) {
                        doc.setTitle(title);
                    }
                } catch (Exception e) { // Fine just carry on                  
                }
                try {
                    Date date = metadata.getDate(Metadata.CREATION_DATE); // MS Word
                    if (null != date) {
                        doc.setPublishedDate(date);
                    } else {
                        date = metadata.getDate(Metadata.DATE); // Dublin
                        if (null != date) {
                            doc.setPublishedDate(date);
                        } else {
                            date = metadata.getDate(Metadata.ORIGINAL_DATE);
                            if (null != date) {
                                doc.setPublishedDate(date);
                            }
                        }
                    }
                } catch (Exception e) { // Fine just carry on                  
                }
                //TESTED

                // If the metadata contains a geotag then apply that:
                try {
                    String lat = metadata.get(Metadata.LATITUDE);
                    String lon = metadata.get(Metadata.LONGITUDE);
                    if ((null != lat) && (null != lon)) {
                        GeoPojo gt = new GeoPojo();
                        gt.lat = Double.parseDouble(lat);
                        gt.lon = Double.parseDouble(lon);
                        doc.setDocGeo(gt);
                    }
                } catch (Exception e) { // Fine just carry on                  
                }

                // Save the entire metadata:
                doc.addToMetadata("_FILE_METADATA_", metadata);

                for (ObjectId communityId : source.getCommunityIds()) {
                    doc.setCommunityId(communityId);
                }
                files.add(doc);

                // Close the input stream
                in.close();
                in = null;

                //TESTED

            } catch (SmbException e) {
                errors++;
                _context.getHarvestStatus()
                        .logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
            } catch (MalformedURLException e) {
                errors++;
                _context.getHarvestStatus()
                        .logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
            } catch (UnknownHostException e) {
                errors++;
                _context.getHarvestStatus()
                        .logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
            } catch (IOException e) {
                errors++;
                _context.getHarvestStatus().logMessage(e.getMessage(), true);
            } catch (TikaException e) {
                errors++;
                _context.getHarvestStatus().logMessage(e.getMessage(), true);
            } catch (Exception e) {
                errors++;
                _context.getHarvestStatus()
                        .logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
            } finally { // Close the input stream if an error occurs
                if (null != in) {
                    try {
                        in.close();
                    } catch (IOException e) {
                        // All good, do nothing
                    }
                }
            } // end exception handling
        } // end dedup check
    } // end XML vs "office" app

    //DEBUG
    //System.out.println("FILE=" + files.size() + " / MEM=" + _memUsage + " VS " + Runtime.getRuntime().totalMemory());
}