com.ikanow.infinit.e.data_model.custom.InfiniteFileInputJsonParser.java Source code

Java tutorial

Introduction

Here is the source code for com.ikanow.infinit.e.data_model.custom.InfiniteFileInputJsonParser.java

Source

/*******************************************************************************
 * Copyright 2012 The Infinit.e Open Source Project
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package com.ikanow.infinit.e.data_model.custom;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;

import org.apache.commons.lang.StringEscapeUtils;
import org.bson.BSONObject;

import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import com.google.gson.stream.JsonReader;
import com.google.gson.stream.JsonToken;
import com.ikanow.infinit.e.data_model.store.config.source.SourceFileConfigPojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.mongodb.BasicDBObject;

//(taken from com.ikanow.infinit.e.harvest.extraction.document.file.JsonToMetadataParser)

/**
 * JSON implementation of {@link InfiniteFileInputParser}.
 * <p>
 * Reads a UTF-8 JSON stream with a lenient Gson {@link JsonReader} and hands back one
 * {@link BasicDBObject} per record. The input may be a sequence of top-level objects
 * (<code>{} {} {}</code>) or a top-level array of objects (<code>[ {}, {}, {} ]</code>). For each
 * top-level object either the whole object is emitted (no root-level identifiers configured) or
 * only the value(s) of the configured field name(s) are emitted.
 * <p>
 * The parser keeps its read position in instance fields between calls to
 * {@link #getNextRecord()}, so one instance serves one stream at a time.
 */
public class InfiniteFileInputJsonParser implements InfiniteFileInputParser {

    ///////////////////////////////////////////////////////////////

    // CONFIGURATION (populated by initialize)

    // Lower-cased names of the fields whose values become records (empty => emit whole objects)
    private final HashSet<String> objectIdentifiers = new HashSet<String>();
    // Lower-cased names registered with a leading "*" (recursive mode - currently rejected by initialize)
    private final HashSet<String> recursiveObjectIdentifiers = new HashSet<String>();
    // A record is only emitted if at least one of these (dotted) fields is present
    private final HashSet<String> fieldsThatNeedToExist = new HashSet<String>();
    private String primaryKey = null;
    private String sourceName = null;
    private boolean bRecurse = false;

    // READER STATE (carried across getNextRecord calls)

    private JsonParser parser = null;
    private JsonReader reader = null;

    // Array matched by an object identifier that is still being drained, and the next index to read
    private JsonArray _secondaryArray = null;
    private int _posInSecondaryArray = 0;

    // True once the opening '[' of a top-level array has been consumed
    private boolean _inTopLevelArray = false;
    // True while positioned inside a top-level object opened by getDocumentFromJson
    private boolean _inSecondaryObject = false;
    // Last token peeked by parseDocument
    private JsonToken tok = JsonToken.BEGIN_OBJECT;

    ///////////////////////////////////////////////////////////////

    // INTERFACE CODE

    /**
     * Binds this parser to a new input stream and configuration.
     * <p>
     * All configuration and cursor state left over from a previous call is discarded first, so the
     * same instance can be re-initialized for another stream.
     *
     * @param inStream   the JSON input, decoded as UTF-8
     * @param fileConfig supplies the primary key path, the source name (URL prefix), the root-level
     *                   field names to extract and the fields that must be present
     * @return this parser
     * @throws IOException declared by the interface
     * @throws RuntimeException if any root-level value starts with "*" (recursive parsing is not supported)
     */
    @Override
    public InfiniteFileInputParser initialize(InputStream inStream, SourceFileConfigPojo fileConfig)
            throws IOException {

        // Start from a clean slate: stale identifiers or cursor flags from an earlier stream would
        // otherwise make the new stream's opening '[' / '{' go unconsumed
        this.objectIdentifiers.clear();
        this.recursiveObjectIdentifiers.clear();
        this.fieldsThatNeedToExist.clear();
        this.bRecurse = false;
        this._secondaryArray = null;
        this._posInSecondaryArray = 0;
        this._inTopLevelArray = false;
        this._inSecondaryObject = false;
        this.tok = JsonToken.BEGIN_OBJECT;

        this.primaryKey = fileConfig.XmlPrimaryKey;
        this.sourceName = fileConfig.XmlSourceName;
        if (null != fileConfig.XmlRootLevelValues) {
            for (String objectId : fileConfig.XmlRootLevelValues) {
                if (objectId.startsWith("*")) {
                    this.bRecurse = true;
                    this.recursiveObjectIdentifiers.add(objectId.substring(1).toLowerCase());
                    throw new RuntimeException("JSON metadata parser: Don't currently support recursive parsing.");
                    //TODO (INF-2469): Not currently supported, it gets a bit tricky?
                } //TESTED
                this.objectIdentifiers.add(objectId.toLowerCase());
            }
        }
        if (null != fileConfig.XmlIgnoreValues) {
            this.fieldsThatNeedToExist.addAll(fileConfig.XmlIgnoreValues);
        }
        reader = new JsonReader(new InputStreamReader(inStream, "UTF-8"));
        reader.setLenient(true);
        parser = new JsonParser();

        return this;
    }

    /**
     * Returns the next record, or null when the input is exhausted.
     * <p>
     * Any elements still pending from an array matched on a previous call are served first; after
     * that parsing resumes from the stream.
     *
     * @return the next record, or null if there are no more
     * @throws IOException declared by the interface
     */
    @Override
    public BSONObject getNextRecord() throws IOException {
        if (null != _secondaryArray) {
            BasicDBObject pending = nextFromSecondaryArray();
            if (null != pending) {
                return pending;
            }
        } //TESTED
        return parseDocument();
    }

    /**
     * Closes the underlying JSON reader, if one was created.
     *
     * @throws IOException if closing the reader fails
     */
    @Override
    public void close() throws IOException {
        if (null != reader) {
            reader.close();
        }
    }

    /**
     * @return the file extension handled by this parser, ".json"
     */
    @Override
    public String getCanonicalExtension() {
        return ".json";
    }

    ///////////////////////////////////////////////////////////////

    // PROCESSING CODE

    /**
     * Advances the reader until a record can be produced.
     * <p>
     * Different cases:
     * <pre>
     * {}
     * ^^ many of these
     * [ {}, {}, {} ]
     * </pre>
     * For each of these cases, you might either want to grab the entire object, or a field within
     * the object.
     *
     * @return the next record, or null when END_DOCUMENT / END_ARRAY / END_OBJECT is reached or when
     *         any exception is raised while reading (which is treated as end of input)
     * @throws IOException declared, although read failures are currently swallowed and reported as null
     */
    public BasicDBObject parseDocument() throws IOException {

        try {
            while (true) { // (use exceptions to get outta here)

                try {
                    tok = reader.peek();
                } catch (Exception e) {
                    // EOF or end of object, keep going and find out...
                    // (a second failure propagates to the outer catch, ie end of input)
                    tok = reader.peek();
                }
                //TESTED

                if (JsonToken.BEGIN_ARRAY == tok) {
                    if (!_inTopLevelArray) {
                        reader.beginArray();
                        _inTopLevelArray = true;
                    }
                    while (reader.hasNext()) {
                        BasicDBObject currObj = readCandidateRecord();
                        if (null != currObj) {
                            return currObj;
                        } //(else carry on...)
                    } //TESTED
                } else if (JsonToken.BEGIN_OBJECT == tok) {
                    BasicDBObject currObj = readCandidateRecord();
                    if (null != currObj) {
                        return currObj;
                    } //(else carry on...)
                    //TESTED (single and multiple doc case)
                } else if ((JsonToken.END_DOCUMENT == tok) || (JsonToken.END_ARRAY == tok)
                        || (JsonToken.END_OBJECT == tok)) {
                    return null;
                } else { // Must be recursing through the next level(s)
                    BasicDBObject currObj = getDocumentFromJson(false);
                    if (null != currObj) {
                        return currObj;
                    } //(else carry on...)
                }
            } // (end loop forever - exception out)
        } catch (Exception e) {
            // This is our EOF (deliberate best-effort: any read/parse failure ends the stream)
        }

        return null;
    }

    // Reads one candidate from the current reader position: the whole next element when no object
    // identifiers are configured, otherwise the next matching field of the current object.
    // Returns null if the candidate was rejected or nothing matched.
    private BasicDBObject readCandidateRecord() throws IOException {
        if (objectIdentifiers.isEmpty()) {
            JsonElement meta = parser.parse(reader);
            return convertJsonToDocument(meta);
        }
        return getDocumentFromJson(false);
    }

    // Serves the next acceptable element of the pending secondary array, advancing the saved
    // position; clears the array and returns null once it is used up.
    private BasicDBObject nextFromSecondaryArray() {
        while (_posInSecondaryArray < _secondaryArray.size()) {
            JsonElement element = _secondaryArray.get(_posInSecondaryArray);
            _posInSecondaryArray++;

            BasicDBObject currObj = convertJsonToDocument(element);
            if (null != currObj) {
                return currObj;
            }
        }
        _secondaryArray = null;
        return null;
    }

    ////////////////////////////

    /**
     * Looks into the current JSON object and finds the field(s) with a configured name
     * (for now the "path" is ignored - maybe later we allow "x.y" terminology).
     * <p>
     * The object is opened on first entry and stays open while records are being returned from
     * it; it is closed once all of its fields have been visited. A matching field holding an object
     * yields one record; a matching field holding an array is stored so that its elements are
     * returned one per call.
     *
     * @param bRecursing true to match against the recursive identifiers instead of the normal ones
     * @return the next record found in this object, or null once the object is exhausted
     * @throws IOException if the underlying reader fails
     */
    private BasicDBObject getDocumentFromJson(boolean bRecursing) throws IOException {
        if (!_inSecondaryObject) {
            reader.beginObject();
            _inSecondaryObject = true;
        }

        while (reader.hasNext()) {
            String name = reader.nextName();

            boolean bMatch;
            if (bRecursing) {
                bMatch = recursiveObjectIdentifiers.contains(name.toLowerCase());
            } else {
                bMatch = objectIdentifiers.contains(name.toLowerCase());
            } //TESTED

            if (bMatch) {
                JsonElement meta = parser.parse(reader);

                if (meta.isJsonObject()) {
                    BasicDBObject currObj = convertJsonToDocument(meta);
                    if (null != currObj) {
                        return currObj;
                    }
                } //TESTED
                else if (meta.isJsonArray()) {
                    // Remember the array so later getNextRecord calls can continue from here
                    _secondaryArray = meta.getAsJsonArray();
                    _posInSecondaryArray = 0;
                    BasicDBObject currObj = nextFromSecondaryArray();
                    if (null != currObj) {
                        return currObj;
                    }
                } //TESTED

            } //TESTED
            else if (bRecurse) { //TODO (INF-2469): Not currently supported, it gets a bit tricky? (need to convert to a stack)
                // (bRecurse is only set immediately before initialize throws, so this is scaffolding)

                JsonToken valueTok = reader.peek();
                if (JsonToken.BEGIN_OBJECT == valueTok) {
                    BasicDBObject currObj = getDocumentFromJson(true);
                    if (null != currObj) {
                        return currObj;
                    }
                } //TESTED
                else if (JsonToken.BEGIN_ARRAY == valueTok) {
                    reader.beginArray();
                    while (reader.hasNext()) {
                        JsonToken elementTok = reader.peek();

                        if (JsonToken.BEGIN_OBJECT == elementTok) {
                            BasicDBObject currObj = getDocumentFromJson(true);
                            if (null != currObj) {
                                return currObj;
                            }
                        } else {
                            reader.skipValue();
                        } //TESTED

                    } //TESTED
                    reader.endArray();
                } else {
                    reader.skipValue();
                } //TESTED
            } else {
                reader.skipValue();
            } //TESTED

        } //(end loop over reader)

        reader.endObject();
        _inSecondaryObject = false;

        return null;

    } //TESTED

    ////////////////////////////////////////////////////////////////////////////////

    // Utility - Check the object is well formed: true if no required fields are configured, or if
    // AT LEAST ONE of the configured fields is present in the element

    private boolean checkIfMandatoryFieldsExist(JsonElement meta) {
        if (this.fieldsThatNeedToExist.isEmpty()) {
            return true;
        }
        for (String field : this.fieldsThatNeedToExist) {
            if (null != getKey(meta, field, false)) {
                return true;
            }
        }
        return false;
    }//TESTED

    /////////////////////////////////

    // Utility - get the primary key (does handle recursion)

    private String getPrimaryKey(JsonElement meta) {
        return getKey(meta, primaryKey, true);
    }

    /////////////////////////////////

    // Utility - create document set: returns null if the element fails the required-field check,
    // otherwise a document with (optionally) a URL of sourceName + primary key and, for JSON
    // objects, the converted content under metadata "json"

    private BasicDBObject convertJsonToDocument(JsonElement meta) {

        // Check that the required-field condition holds:
        if (!checkIfMandatoryFieldsExist(meta)) {
            return null;
        }
        //TESTED

        // Primary key and create doc
        BasicDBObject currObj = new BasicDBObject();
        if ((null != primaryKey) && (null != sourceName)) {
            String keyValue = getPrimaryKey(meta);
            if (null != keyValue) {
                currObj.put(DocumentPojo.url_, sourceName + keyValue);
            }
        }

        if (meta.isJsonObject()) {
            currObj.put(DocumentPojo.metadata_,
                    new BasicDBObject("json", Arrays.asList(convertJsonObjectToBson(meta.getAsJsonObject()))));
        }
        return currObj;

    } //TESTED

    /**
     * Utility - get an arbitrary key (does handle recursion).
     * <p>
     * Walks the dot-separated path through nested objects. A primitive reached at any step is
     * returned as a string. When an array is met (and arrays are allowed) its first element is used
     * to continue the walk.
     *
     * @param meta           the element to search; anything other than a JSON object yields null
     * @param key            dot-separated field path; null yields null
     * @param bPrimitiveOnly if true only primitive values count (arrays/objects give null); if false
     *                       a path that resolves to an object or array gives the marker "[Object]"
     * @return the primitive value as a string, "[Object]" for an existing non-primitive when allowed,
     *         or null if the path does not resolve
     */
    private String getKey(JsonElement meta, String key, boolean bPrimitiveOnly) {
        if ((null == meta) || (null == key) || !meta.isJsonObject()) {
            return null;
        }
        try {
            String[] components = key.split("\\.");
            JsonObject metaObj = meta.getAsJsonObject();
            for (int i = 0; i < components.length; i++) {
                JsonElement child = metaObj.get(components[i]);

                if (null == child) {
                    return null;
                } //TESTED
                else if (child.isJsonObject()) {
                    metaObj = child.getAsJsonObject();
                } //TESTED
                else if (child.isJsonPrimitive()) {
                    return child.getAsString();
                } //TESTED
                else if (bPrimitiveOnly) { // (meta isn't allowed to be an array, then you'd have too many primary keys!)
                    return null;
                } //TOTEST (? - see JsonToMetadataParser)
                else if (child.isJsonArray()) { // Check with first instance
                    JsonArray array = child.getAsJsonArray();
                    if (0 == array.size()) {
                        return null; // nothing to check against
                    }
                    JsonElement first = array.get(0);
                    if (first.isJsonObject()) {
                        metaObj = first.getAsJsonObject();
                    } else if (i < components.length - 1) {
                        // Remaining path components cannot be resolved inside a non-object element
                        // (and must not be looked up in the parent object instead)
                        return null;
                    }
                } //TESTED
                else { // (JSON null)
                    return null;
                }
            }
            if (!bPrimitiveOnly) { // allow objects, we just care if the field exists...
                return "[Object]";
            } //TESTED
        } catch (RuntimeException e) {
            // Defensive only: treat any unexpected element shape as "no such key"
        }

        return null;
    }
    //(TEST status unknown - see JsonToMetadataParser)

    /////////////////////////////////

    // Utility - conversion

    // Initial size to request for a container expected to hold expectedSize entries
    private static int capacity(int expectedSize) {
        if (expectedSize < 3) {
            return expectedSize + 1;
        }
        return expectedSize + expectedSize / 3;
    }

    /**
     * Converts a JsonObject to a BasicDBObject without HTML-unescaping string values.
     *
     * @param json the JSON object to convert
     * @return the converted object, or null if no entries were converted
     */
    public static BasicDBObject convertJsonObjectToBson(JsonObject json) {
        return convertJsonObjectToBson(json, false);
    }

    /**
     * Converts a JsonObject to a BasicDBObject.
     * <p>
     * Nested objects and arrays are converted recursively; every primitive is stored as its string
     * form; JSON null values are skipped.
     *
     * @param json          the JSON object to convert
     * @param bHtmlUnescape if true, HTML entities in primitive values are unescaped
     * @return the converted object, or null if no entries were converted
     */
    public static BasicDBObject convertJsonObjectToBson(JsonObject json, boolean bHtmlUnescape) {
        int length = json.entrySet().size();
        BasicDBObject list = new BasicDBObject(capacity(length));
        for (Map.Entry<String, JsonElement> jsonKeyEl : json.entrySet()) {
            String fieldName = jsonKeyEl.getKey();
            JsonElement jsonEl = jsonKeyEl.getValue();
            if (jsonEl.isJsonArray()) {
                list.put(fieldName, handleJsonArray(jsonEl.getAsJsonArray(), bHtmlUnescape));
            } else if (jsonEl.isJsonObject()) {
                list.put(fieldName, convertJsonObjectToBson(jsonEl.getAsJsonObject(), bHtmlUnescape));
            } else if (jsonEl.isJsonPrimitive()) {
                if (bHtmlUnescape) {
                    list.put(fieldName, StringEscapeUtils.unescapeHtml(jsonEl.getAsString()));
                } else {
                    list.put(fieldName, jsonEl.getAsString());
                }
            }
        }
        if (list.size() > 0) {
            return list;
        }
        return null;
    }
    //TESTED

    // Converts a JSON array element by element (objects and arrays recursively, primitives as
    // strings); slots for any other element kind are left null
    private static Object[] handleJsonArray(JsonArray jarray, boolean bHtmlUnescape) {
        Object[] o = new Object[jarray.size()];
        for (int i = 0; i < jarray.size(); i++) {
            JsonElement jsonEl = jarray.get(i);
            if (jsonEl.isJsonObject()) {
                o[i] = convertJsonObjectToBson(jsonEl.getAsJsonObject(), bHtmlUnescape);
            } else if (jsonEl.isJsonArray()) {
                o[i] = handleJsonArray(jsonEl.getAsJsonArray(), bHtmlUnescape);
            } else if (jsonEl.isJsonPrimitive()) {
                if (bHtmlUnescape) {
                    o[i] = StringEscapeUtils.unescapeHtml(jsonEl.getAsString());
                } else {
                    o[i] = jsonEl.getAsString();
                }
            }
        }
        return o;
    }
    //TESTED
}