org.lenskit.data.dao.file.TextEntitySource.java Source code

Java tutorial

Introduction

Here is the source code for org.lenskit.data.dao.file.TextEntitySource.java

Source

/*
 * LensKit, an open source recommender systems toolkit.
 * Copyright 2010-2014 LensKit Contributors.  See CONTRIBUTORS.md.
 * Work on LensKit has been funded by the National Science Foundation under
 * grants IIS 05-34939, 08-08692, 08-12148, and 10-17697.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 51
 * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */
package org.lenskit.data.dao.file;

import com.fasterxml.jackson.databind.JsonNode;
import com.google.common.base.Charsets;
import com.google.common.io.CharSource;
import com.google.common.io.Files;
import com.google.common.io.Resources;
import org.apache.commons.lang3.ClassUtils;
import org.grouplens.grapht.util.ClassLoaders;
import org.lenskit.data.entities.*;
import org.lenskit.util.io.LineStream;
import org.lenskit.util.io.ObjectStream;
import org.lenskit.util.io.ObjectStreams;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;

/**
 * Entity reader that loads entities from text data, often stored in a file.
 *
 * <p>A source is configured with a place to read characters from (a file, a URL, or an
 * in-memory character sequence) and an {@link EntityFormat} that says how many header
 * lines to consume and how to turn each remaining line into an {@link Entity}.
 */
public class TextEntitySource implements EntitySource {
    private static final Logger logger = LoggerFactory.getLogger(TextEntitySource.class);
    private final String name;
    private CharSource source;
    private URL sourceURL;
    private EntityFormat format;

    /**
     * Construct a new text entity source named {@code "<unnamed>"}.
     */
    public TextEntitySource() {
        this("<unnamed>");
    }

    /**
     * Construct a new text entity source.
     * @param name The source's name.
     */
    public TextEntitySource(String name) {
        this.name = name;
    }

    /**
     * Get the name of this data source.
     * @return The data source name.
     */
    public String getName() {
        return name;
    }

    /**
     * Set the source file for this reader.
     *
     * <p>NOTE(review): file input is decoded with the platform default charset, whereas
     * {@link #setURI(URL)} decodes as UTF-8 — confirm whether this difference is intentional.
     *
     * @param file The source file.
     * @throws IllegalArgumentException if the file's URI cannot be converted to a URL.
     */
    public void setFile(Path file) {
        source = Files.asCharSource(file.toFile(), Charset.defaultCharset());
        try {
            sourceURL = file.toUri().toURL();
        } catch (MalformedURLException e) {
            throw new IllegalArgumentException("invalid URL " + file, e);
        }
    }

    /**
     * Set the URL of the input data.  The data is decoded as UTF-8.
     * @param url The URL of the input data.
     */
    public void setURI(URL url) {
        sourceURL = url;
        source = Resources.asCharSource(url, Charsets.UTF_8);
    }

    /**
     * Get the URL of the input data.
     * @return The URL the data is read from, or {@code null} if none has been set or the
     *         source reads from in-memory text (see {@link #setSource(CharSequence)}).
     */
    public URL getURL() {
        return sourceURL;
    }

    /**
     * Set a string from which to read entities.  This clears any previously-set URL.
     * @param text The text to read entities from.
     */
    public void setSource(CharSequence text) {
        source = CharSource.wrap(text);
        sourceURL = null;
    }

    /**
     * Set the entity format for the reader.
     * @param format The entity format.
     */
    public void setFormat(EntityFormat format) {
        this.format = format;
    }

    /**
     * Get the entity format for the reader.
     * @return The entity format.
     */
    public EntityFormat getFormat() {
        return format;
    }

    /**
     * Open a stream to read entities from this source.
     *
     * <p>The header lines requested by the format are consumed first and handed to the
     * format to build the line parser; the remaining lines are parsed into entities.
     *
     * @return A stream of entities.
     * @throws IOException if the input cannot be opened or has fewer lines than the format's
     *         header requires.
     * @throws IllegalStateException if no input or no format has been configured.
     */
    @Override
    public ObjectStream<Entity> openStream() throws IOException {
        // Validate configuration before opening anything, so a misconfigured source
        // fails with a clear message and without leaving a reader open.
        if (source == null) {
            throw new IllegalStateException("no input configured for source " + name);
        }
        if (format == null) {
            throw new IllegalStateException("no entity format configured for source " + name);
        }

        BufferedReader reader = source.openBufferedStream();
        ObjectStream<String> lines = new LineStream(reader);
        try {
            int headerLines = format.getHeaderLines();
            List<String> header = new ArrayList<>();
            while (header.size() < headerLines) {
                String line = lines.readObject();
                if (line == null) {
                    throw new IOException(
                            String.format("expected %d header lines, found %d", headerLines, header.size()));
                }
                header.add(line);
            }
            LineEntityParser parser = format.makeParser(header);
            return ObjectStreams.transform(lines, parser);
        } catch (Throwable th) {
            // Any failure before the stream is handed to the caller must release the reader;
            // a secondary failure while closing is attached rather than masking the cause.
            try {
                lines.close();
            } catch (Throwable closeFailure) {
                th.addSuppressed(closeFailure);
            }
            throw th;
        }
    }

    /**
     * Create a file reader, loading any custom entity builder class with the class loader
     * inferred for {@code TextEntitySource}.
     * @param name The reader name.
     * @param object The configuring object.
     * @param base The base URI for source data.
     * @return The new entity reader.
     */
    static TextEntitySource fromJSON(String name, JsonNode object, URI base) {
        return fromJSON(name, object, base, ClassLoaders.inferDefault(TextEntitySource.class));
    }

    /**
     * Create a file reader.
     *
     * <p>Fields read from the configuration object: {@code file} (required, resolved against
     * {@code base}), {@code format} ({@code csv}, {@code tsv} or {@code delimited}; default
     * {@code delimited}), {@code delimiter}, {@code header} (boolean or number of lines to skip),
     * {@code entity_type} (default {@code rating}), {@code columns} (array or object), and
     * {@code builder} ({@code basic} or a class name).
     *
     * @param name The reader name.
     * @param object The configuring object.
     * @param base The base URI for source data.
     * @param loader The class loader used to load a custom entity builder class.
     * @return The new entity reader.
     * @throws IllegalArgumentException if the configuration is invalid.
     */
    static TextEntitySource fromJSON(String name, JsonNode object, URI base, ClassLoader loader) {
        TextEntitySource source = new TextEntitySource(name);
        JsonNode fileNode = object.path("file");
        if (fileNode.isMissingNode() || fileNode.isNull()) {
            throw new IllegalArgumentException("no file specified for text source " + name);
        }
        URI uri = base.resolve(fileNode.asText());
        try {
            source.setURI(uri.toURL());
        } catch (MalformedURLException e) {
            throw new IllegalArgumentException("Cannot resolve URI " + uri, e);
        }
        logger.info("loading text file source {} to read from {}", name, source.getURL());

        // Locale.ROOT keeps keyword matching independent of the JVM's default locale.
        String fmt = object.path("format").asText("delimited").toLowerCase(Locale.ROOT);
        String delim;
        switch (fmt) {
        case "csv":
            delim = ",";
            break;
        case "tsv":
        case "delimited":
            delim = "\t";
            break;
        default:
            throw new IllegalArgumentException("unsupported data format " + fmt);
        }
        // An explicit delimiter overrides the one implied by the format name.
        JsonNode delimNode = object.path("delimiter");
        if (delimNode.isValueNode()) {
            delim = delimNode.asText();
        }

        DelimitedColumnEntityFormat format = new DelimitedColumnEntityFormat();
        format.setDelimiter(delim);
        logger.debug("{}: using delimiter {}", name, delim);
        JsonNode header = object.path("header");
        // Name-keyed column maps are only allowed when the file has a header line.
        boolean canUseColumnMap = false;
        if (header.isBoolean() && header.asBoolean()) {
            logger.debug("{}: reading header", name);
            format.setHeader(true);
            canUseColumnMap = true;
        } else if (header.isNumber()) {
            format.setHeaderLines(header.asInt());
            logger.debug("{}: skipping {} header lines", name, format.getHeaderLines());
        }

        String eTypeName = object.path("entity_type").asText("rating").toLowerCase(Locale.ROOT);
        EntityType etype = EntityType.forName(eTypeName);
        logger.debug("{}: reading entities of type {}", name, etype);
        // May be null: the code below falls back to BasicEntityBuilder and requires explicit columns.
        EntityDefaults entityDefaults = EntityDefaults.lookup(etype);
        format.setEntityType(etype);
        format.setEntityBuilder(
                entityDefaults != null ? entityDefaults.getDefaultBuilder() : BasicEntityBuilder.class);

        JsonNode columns = object.path("columns");
        if (columns.isMissingNode() || columns.isNull()) {
            List<TypedName<?>> defColumns = entityDefaults != null ? entityDefaults.getDefaultColumns() : null;
            if (defColumns == null) {
                throw new IllegalArgumentException("no columns specified and no default columns available");
            }

            for (TypedName<?> attr : defColumns) {
                format.addColumn(attr);
            }
        } else if (columns.isObject()) {
            if (!canUseColumnMap) {
                throw new IllegalArgumentException("cannot use column map without file header");
            }
            Iterator<Map.Entry<String, JsonNode>> colIter = columns.fields();
            while (colIter.hasNext()) {
                Map.Entry<String, JsonNode> col = colIter.next();
                format.addColumn(col.getKey(), parseAttribute(entityDefaults, col.getValue()));
            }
        } else if (columns.isArray()) {
            for (JsonNode col : columns) {
                format.addColumn(parseAttribute(entityDefaults, col));
            }
        } else {
            throw new IllegalArgumentException("invalid format for columns");
        }

        // An explicit builder overrides the entity type's default builder.
        JsonNode ebNode = object.path("builder");
        if (ebNode.isTextual()) {
            String ebName = ebNode.asText();
            if (ebName.equals("basic")) {
                format.setEntityBuilder(BasicEntityBuilder.class);
            } else {
                Class bld;
                try {
                    bld = ClassUtils.getClass(loader, ebName);
                } catch (ClassNotFoundException e) {
                    throw new IllegalArgumentException("cannot load class " + ebName, e);
                }
                format.setEntityBuilder(bld);
            }
        }
        logger.debug("{}: using entity builder {}", name, format.getEntityBuilder());

        source.setFormat(format);
        return source;
    }

    /**
     * Parse one column specification into a typed attribute name.
     *
     * @param entityDefaults The defaults for the entity type being read, or {@code null} if
     *                       the entity type has none.
     * @param col The column specification: null/missing, an object with {@code name} and
     *            {@code type} fields, or a string naming the attribute.
     * @return The attribute, or {@code null} if the specification is null or missing.
     * @throws IllegalArgumentException if the specification has an unsupported shape or an
     *         object specification lacks {@code name} or {@code type}.
     */
    private static TypedName<?> parseAttribute(EntityDefaults entityDefaults, JsonNode col) {
        if (col.isNull() || col.isMissingNode()) {
            return null;
        } else if (col.isObject()) {
            JsonNode nameNode = col.path("name");
            JsonNode typeNode = col.path("type");
            if (nameNode.isMissingNode() || nameNode.isNull()
                    || typeNode.isMissingNode() || typeNode.isNull()) {
                throw new IllegalArgumentException(
                        "attribute specification must have name and type: " + col.toString());
            }
            return TypedName.create(nameNode.asText(), typeNode.asText());
        } else if (col.isTextual()) {
            String attrName = col.asText();
            // Entity types without registered defaults have no attribute defaults to consult.
            TypedName<?> attr = entityDefaults != null ? entityDefaults.getAttributeDefaults(attrName) : null;
            if (attr == null) {
                // Unknown attribute: "id" is read as a long, anything else as a string.
                attr = TypedName.create(attrName, attrName.equals("id") ? Long.class : String.class);
            }
            return attr;
        } else {
            throw new IllegalArgumentException("invalid attribute specification: " + col.toString());
        }
    }
}