org.balloon_project.overflight.task.indexing.IndexingTask.java Source code

Java tutorial

Introduction

Here is the source code for org.balloon_project.overflight.task.indexing.IndexingTask.java

Source

/*
 * The MIT License (MIT)
 *
 * Copyright (c) 2014 Distributed
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package org.balloon_project.overflight.task.indexing;

import com.google.common.base.Stopwatch;
import com.hp.hpl.jena.query.QueryExecutionFactory;
import com.hp.hpl.jena.query.QuerySolution;
import com.hp.hpl.jena.query.ResultSet;
import com.hp.hpl.jena.sparql.engine.http.QueryEngineHTTP;
import org.balloon_project.Configuration;
import org.balloon_project.overflight.model.Endpoint;
import org.balloon_project.overflight.model.RelEntity;
import org.balloon_project.overflight.model.Triple;
import org.balloon_project.overflight.service.EndpointService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Scope;
import org.springframework.stereotype.Component;

import java.io.*;
import java.net.UnknownHostException;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;

@Component
@Scope(value = "prototype")
public class IndexingTask implements Runnable {

    @Autowired
    Configuration configuration;

    @Autowired
    private EndpointService endpointService;

    private Endpoint endpoint;
    private RelEntity relEntity;
    private String filename;
    private File file;
    private Logger logger = LoggerFactory.getLogger(IndexingTask.class);

    public static final String N_TRIPLES_EXTENSION = ".nt";
    public static final String BEGIN_DATA = "# BEGIN DATA";
    public static final String KEY_ENDPOINT = "Endpoint";
    public static final String KEY_PREDICATE = "Predicate";
    public static final String KEY_DATE = "Date";
    public static final String FILE_NAME_IN_ZIP = "dump.n3";
    public static final String DUMP_COMMENT_PREFIX = "#";

    public IndexingTask() {
        // WORKAROUND: spring doesn't allow lookup methods with arguments. Therefore you have to call the init method after creation until this bug has been fixed.
    }

    public void init(Endpoint endpoint, RelEntity relEntity) {
        this.endpoint = endpoint;
        this.relEntity = relEntity;
        this.filename = endpoint.getEndpointID() + "_" + relEntity.getShortname() + N_TRIPLES_EXTENSION;
        try {
            this.file = getOrCreateDumpFile();
        } catch (IOException e) {
            throw new IllegalStateException("couldn't get access to filesystem");
        }
    }

    @Override
    public void run() {
        // TODO check if already running?

        String query = Statements.getInstace().getSameAsSparqlQuery(relEntity);
        int limit = configuration.getSparqlInitialQueryLimit();
        int offset = 0;
        try {
            offset = getOffset();
        } catch (IOException e) {
            logger.error("File access error");
            e.printStackTrace();
            return;
        }

        // reflect the starting status
        endpointService.setStatusStarted(this.endpoint, relEntity);
        logger.info("START indexing on endpoint " + endpoint.getEndpointID() + " with predicate "
                + relEntity.getPredicate() + " offset=" + offset);

        // start querying the endpoint for the specified predicate
        // fallback on simple query if there is a error (e.g. subquerries not allowed)
        // but fallback should be considered only once --> fallback boolean
        boolean fallback = false;
        boolean finished = false;
        try {
            while (!finished) {
                try {

                    List<Triple> results = queryEndpoint(query, limit, offset);
                    dumpIndexedTriples(results);

                    // update query configuration for next iteration and persist them
                    offset += results.size();
                    limit = results.size();
                    finished = results.size() == 0;

                } catch (Throwable e) {
                    // consider fallback only once. update query if this wasn't the fallback already
                    if (fallback) {
                        throw e;
                    } else {
                        logger.debug(this.endpoint.getEndpointID()
                                + ": ERROR --> Fallback to simple query predicate " + relEntity.getPredicate());
                        query = Statements.getInstace().getSimpleSameAsSparqlQuery(relEntity);
                        fallback = true;
                    }
                }
            }

            // the stored dump file will be prepared for importing in database and made available for public download
            finalizeDump();
            endpointService.setStatusFinished(endpoint, relEntity);
            logger.info(this.endpoint.getEndpointID() + ": Indexing service CRAWLED - " + offset
                    + " entries indexed with predicate " + relEntity.getPredicate());
        } catch (Throwable e) {
            e.printStackTrace();
            // catch different exceptions to detect reconsideration or exclusion of endpoints
            handleQueryException(e, offset, limit);
        }
    }

    private void handleQueryException(Throwable e, int offset, int limit) {
        if (e instanceof UnknownHostException
                || e.getMessage() != null && (e.getMessage().contains("UnknownHostException")
                        || e.getMessage().contains("Failed to connect to remote server"))) {
            // exclude endpoint if endpoint is not available
            endpointService.updateStatus(this.endpoint, relEntity, IndexingStatus.EXCLUDED,
                    "endpoint not available");
            logger.error(this.endpoint.getEndpointID() + ": " + KEY_ENDPOINT
                    + " unreachable. Exclude endpoint from indexing with predicate " + relEntity.getPredicate());

        } else if (e.getMessage() != null && e.getMessage().contains("Request forbidden")) {
            // exclude endpoint if endpoint is not public
            endpointService.updateStatus(this.endpoint, relEntity, IndexingStatus.EXCLUDED, "access forbidden");
            logger.error(this.endpoint.getEndpointID()
                    + ": Access forbidden. Exclude endpoint from indexing with predicate "
                    + relEntity.getPredicate());

        } else {
            // reconsider endpoint if any other error occurred
            endpointService.updateStatus(this.endpoint, relEntity, IndexingStatus.ERROR, e.getMessage());
            logger.error(this.endpoint.getEndpointID() + ": ERROR with predicate " + relEntity.getPredicate()
                    + " Details: " + e.getClass() + " - " + e.getMessage());
        }
    }

    private List<Triple> queryEndpoint(String query, int limit, int offset) throws Exception {
        Stopwatch queryTimer = Stopwatch.createStarted();
        query += " LIMIT " + limit;

        if (offset > 0) {
            query += " OFFSET " + offset;
        }

        logger.debug("Querying " + endpoint.getEndpointID() + ": " + query);

        List<Triple> results = new ArrayList<>();
        QueryEngineHTTP qeHTTP = (QueryEngineHTTP) QueryExecutionFactory
                .sparqlService(this.endpoint.getSparqlEndpoint(), query);
        qeHTTP.setTimeout(configuration.getSPARQLReadTimeout(), configuration.getSPARQLConnectTimeout());

        try {
            ResultSet rs = qeHTTP.execSelect();
            while (rs.hasNext()) {
                QuerySolution result = rs.next();

                if (result.get("s") != null && result.get("s").isResource() && result.get("o") != null
                        && result.get("o").isResource()) {
                    Triple item = new Triple(this.endpoint, result.get("s").toString(), relEntity,
                            result.get("o").toString());
                    results.add(item);
                }
            }

            qeHTTP.close();
            logger.info(this.endpoint.getEndpointID() + ": " + results.size() + " items retrieved => Total: "
                    + offset + " (query duration: " + queryTimer.stop().elapsed(TimeUnit.MILLISECONDS)
                    + " ms) predicate " + relEntity.getPredicate());

            return results;
        } catch (Exception e) {
            qeHTTP.abort();
            throw e;
        }
    }

    private void dumpIndexedTriples(List<Triple> results) throws IOException {
        Stopwatch storeTimer = Stopwatch.createStarted();

        if (results.size() > 0) {
            BufferedWriter writer = new BufferedWriter(new FileWriter(file, true));

            try {
                for (Triple triple : results) {
                    StringBuilder sb = new StringBuilder();
                    sb.append("<").append(triple.getSubject()).append("> <")
                            .append(triple.getRelEntity().getPredicate()).append("> <").append(triple.getObject())
                            .append(">.\n");
                    writer.append(sb.toString());
                }
            } finally {
                writer.flush();
                writer.close();
            }
        }

        logger.debug(this.endpoint.getEndpointID() + ": Intermediate result persisted (Size = " + results.size()
                + ") Continue query process" + " (save duration: "
                + storeTimer.stop().elapsed(TimeUnit.MILLISECONDS) + " ms) predicate " + relEntity.getPredicate());
    }

    private File getOrCreateDumpFile() throws IOException {
        // create file if it not already exists
        File dir = new File(configuration.getTripleDumpStoreDirectory());
        dir.mkdirs();
        File dumpFile = new File(dir, filename);
        boolean newFile = dumpFile.createNewFile();

        // adding header to new file
        if (newFile) {
            BufferedWriter writer = new BufferedWriter(new FileWriter(dumpFile));
            try {
                SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
                writer.append(
                        "# ---------------------------------------------------------------------------------------------------------------------------\n");
                writer.append(
                        "# Triples indexed by Balloon Overflight (https://www.dimis.fim.uni-passau.de/balloon/)\n");
                writer.append("# " + KEY_ENDPOINT + ": ").append(endpoint.getEndpointID()).append("\n");
                writer.append("# URL: ").append(endpoint.getUrl()).append("\n");
                writer.append("# SPARQL: ").append(endpoint.getSparqlEndpoint()).append("\n");
                writer.append("# Information: http://datahub.io/dataset/").append(endpoint.getEndpointID())
                        .append("\n");
                writer.append("# " + KEY_PREDICATE + ": ").append(relEntity.getPredicate()).append("\n");
                writer.append("# PredicateIdentifier: ").append(relEntity.getShortname()).append("\n");
                writer.append("# PredicateType: ").append(relEntity.getType().toString()).append("\n");
                writer.append("# " + KEY_DATE + ": ").append(dateFormat.format(new Date())).append("\n");
                writer.append(
                        "# ---------------------------------------------------------------------------------------------------------------------------\n");
                writer.append(BEGIN_DATA);
                writer.append("\n");
            } finally {
                writer.flush();
                writer.close();
            }
        }

        return dumpFile;
    }

    private void finalizeDump() throws IOException {
        // move file to import directory
        File importDir = new File(configuration.getDatabaseImportDirectory());
        File destination = new File(importDir, filename);
        importDir.mkdirs();
        file.renameTo(destination);

        // Zip the file
        //            logger.debug("Compressing the dump for the sameAs-Statements of endpoint " + endpoint.getEndpointID());
        //            // create a temporary file path but do not create a file
        //            File tempZipFile = new File(System.getProperty("java.io.tmpdir") + UUID.randomUUID() + ".zip");
        //
        //            try {
        //                ZipFile zip = new ZipFile(tempZipFile);
        //                ZipParameters parameters = new ZipParameters();
        //                parameters.setCompressionMethod(Zip4jConstants.COMP_DEFLATE);
        //                parameters.setCompressionLevel(Zip4jConstants.DEFLATE_LEVEL_NORMAL);
        //                parameters.setFileNameInZip(FILE_NAME_IN_ZIP);
        //                parameters.setSourceExternalStream(true);
        //                zip.createZipFile(file, parameters);
        //
        //                // upload zip file to ftp server
        //                logger.debug("Upload compressed dump for the sameAs-Statements of endpoint " + endpoint.getEndpointID());
        //                FileSystemManager fsManager = VFS.getManager();
        //                FileSystemOptions opts = new FileSystemOptions();
        //                FtpFileSystemConfigBuilder.getInstance().setPassiveMode(opts, true);
        //
        //                FileObject sourceFile = null;
        //                FileObject destinationFile = null;
        //                try {
        //                    sourceFile = fsManager.toFileObject(zip.getFile());
        //                    String destinationPath = FTP_CONNECTION + endpoint.getEndpointID() + "_" + dateString + "_" + System.currentTimeMillis() + ".zip";
        //                    destinationFile = fsManager.resolveFile(destinationPath, opts);
        //                    destinationFile.copyFrom(sourceFile, new AllFileSelector());
        //                } finally {
        //                    sourceFile.close();
        //                    destinationFile.close();
        //                }
        //
        //            } finally {
        //                if (tempZipFile.exists()) {
        //                    tempZipFile.delete();
        //                }
        //            }
    }

    private int getOffset() throws IOException {
        BufferedReader reader = new BufferedReader(new FileReader(file));
        int tripleCount = 0;
        String line;

        while ((line = reader.readLine()) != null) {
            if (!line.trim().startsWith(DUMP_COMMENT_PREFIX)) {
                tripleCount++;
            }
        }

        reader.close();
        return tripleCount;
    }
}