es.upv.grycap.coreutils.fiber.http.HttpDataFetcher.java Source code

Introduction

Here is the source code for es.upv.grycap.coreutils.fiber.http.HttpDataFetcher.java
Source

/*
 * Core Utils - Fiber-enabled clients.
 * Copyright 2015-2016 GRyCAP (Universitat Politecnica de Valencia)
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 * This product combines work with different licenses. See the "NOTICE" text
 * file for details on the various modules and licenses.
 * 
 * The "NOTICE" text file is part of the distribution. Any derivative works
 * that you distribute must include a readable copy of the "NOTICE" text file.
 */

package es.upv.grycap.coreutils.fiber.http;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.collect.ImmutableMap.of;
import static es.upv.grycap.coreutils.fiber.net.UrlBuilder.getUrlBuilder;
import static java.nio.file.StandardCopyOption.REPLACE_EXISTING;
import static java.util.Collections.synchronizedList;
import static java.util.Objects.requireNonNull;
import static java.util.Optional.ofNullable;
import static org.apache.commons.lang3.StringUtils.removeEnd;
import static org.apache.commons.lang3.StringUtils.trimToNull;
import static org.slf4j.LoggerFactory.getLogger;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Future;
import java.util.stream.Collectors;

import javax.annotation.Nullable;

import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHost;
import org.apache.http.HttpRequest;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.StatusLine;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.utils.URIUtils;
import org.apache.http.concurrent.FutureCallback;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.nio.client.CloseableHttpAsyncClient;
import org.apache.http.impl.nio.client.HttpAsyncClients;
import org.apache.http.message.BasicHttpRequest;
import org.apache.http.nio.client.methods.ZeroCopyConsumer;
import org.apache.http.nio.protocol.BasicAsyncRequestProducer;
import org.apache.http.nio.protocol.HttpAsyncRequestProducer;
import org.slf4j.Logger;

import com.google.common.collect.Range;

import co.paralleluniverse.fibers.httpasyncclient.FiberCloseableHttpAsyncClient;
import es.upv.grycap.coreutils.fiber.net.UrlBuilder;

/**
 * High-concurrency data fetcher. Note that this class will name fetched files using a convention-over-configuration pattern 
 * where the suffix <tt>.partial</tt> is appended to the filenames to differentiate between files completely retrieved and 
 * files being fetched. Once the download complete, the file is moved to its final destination, by simply removing the suffix
 * from the filename and renaming the file. 
 * @author Erik Torres
 * @since 0.1.0
 */
public class HttpDataFetcher {

    private static final Logger LOGGER = getLogger(HttpDataFetcher.class);

    /**
     * Maximum allowed concurrency. Values above this limit will be trimmed. 
     */
    public static final int MAX_CONCURRENCY = 32;

    private final int concurrencyLevel;

    /**
     * Convenient constructor that allows creating a new instance of this class setting the value of the concurrency level.
     * Values above this {@link HttpDataFetcher#MAX_CONCURRENCY limit} will be trimmed.
     * @param concurrencyLevel - desired concurrency level
     */
    public HttpDataFetcher(final int concurrencyLevel) {
        this.concurrencyLevel = Range.closed(1, MAX_CONCURRENCY).contains(concurrencyLevel) ? concurrencyLevel
                : MAX_CONCURRENCY;
        LOGGER.info(new StringBuilder("Concurrency level: ").append(this.concurrencyLevel).toString());
    }

    /**
     * Specialization of the method {@link HttpDataFetcher#fetchToDir(URL, String, List, String, String, File)} that allows fetching
     * and saving a bunch of objects to the specified directory from a server that uses a REST or REST-like API where each object 
     * is retrieved from the URL formed appending the object's identifier to the path of the the base URL.
     * @param baseUrl - base URL from where the objects will be fetched
     * @param ids - a list with the identifiers of the all requests that will be attempted
     * @param outdir - directory where the files will be stored
     * @return A {@link CompletableFuture} that allows cancellation. Once each fetch operation is completed, its status is updated
     *         in the future with one of the possible values provided by the enumeration {@link FetchStatus}.
     * @throws IOException If an error occurs during the execution of the method that prevents fetching or saving the files.
     */
    public FecthFuture fetchToDir(final URL baseUrl, final List<String> ids, final File outdir) throws IOException {
        return fetchToDir(baseUrl, null, ids, null, null, outdir);
    }

    /**
     * Allows fetching and saving a bunch of objects to the specified directory from a server that uses a REST or REST-like API 
     * where each object is retrieved from the URL formed appending the object's identifier to the path of the the base URL, and 
     * optionally from a server that uses a parameter to identify the objects. Supports additional configuration options to name
     * the fetched objects.
     * @param baseUrl - base URL from where the objects will be fetched
     * @param queryParam - if defined, a query parameter will be appended to the base URL with the identifier of the request
     * @param ids - a list with the identifiers of the all requests that will be attempted
     * @param prefix - optionally prepend this prefix to the filenames of the saved files
     * @param suffix - optionally append this suffix to the filenames of the saved files
     * @param outdir - directory where the files will be stored
     * @return A {@link CompletableFuture} that allows cancellation. Once each fetch operation is completed, its status is updated
     *         in the future with one of the possible values provided by the enumeration {@link FetchStatus}.
     * @throws IOException If an error occurs during the execution of the method that prevents fetching or saving the files.
     */
    public FecthFuture fetchToDir(final URL baseUrl, final @Nullable String queryParam, final List<String> ids,
            final @Nullable String prefix, final @Nullable String suffix, final File outdir) throws IOException {
        // check mandatory parameters
        requireNonNull(baseUrl, "A valid URL expected");
        final FecthFuture toBeCompleted = new FecthFuture(
                requireNonNull(ids, "A valid list of identifiers expected").stream().map(StringUtils::trimToNull)
                        .filter(Objects::nonNull).distinct().collect(Collectors.toList()));
        requireNonNull(outdir, "A valid output directory expected");
        checkArgument((outdir.isDirectory() && outdir.canWrite()) || outdir.mkdirs(),
                new StringBuilder("Cannot write to the output directory: ").append(outdir.getAbsolutePath())
                        .toString());
        // get optional parameters
        final Optional<String> queryParam2 = ofNullable(trimToNull(queryParam));
        final String prefix2 = ofNullable(prefix).orElse("");
        final String suffix2 = ofNullable(suffix).orElse("");
        try (final CloseableHttpAsyncClient asyncHttpClient = createFiberCloseableHttpAsyncClient()) {
            asyncHttpClient.start();
            final UrlBuilder urlBuilder = getUrlBuilder(baseUrl);
            // an explanation is needed since this code is instrumented by Quasar and Comsat: requests are created during the first part of
            // this lambda expression (map), but they are not executed until the get() method is called in the second part of the expression
            // (forEach). Here that parallel stream is used to block and wait for the requests to complete. In case that a single stream is
            // used, each request will be created and executed sequentially. Therefore, the alternative to parallel stream is to separate
            // the lambda expression in two loops, creating the requests in the first loop and calling get() in the second one.
            toBeCompleted.monList.parallelStream().map(m -> {
                try {
                    // create output file
                    final File outfile = new File(outdir,
                            new StringBuilder(prefix2).append(m.id).append(suffix2).append(".partial").toString());
                    checkState(outfile.createNewFile(), new StringBuilder("Cannot create the output file: ")
                            .append(outfile.getAbsolutePath()).toString());
                    // create the HTTP request               
                    final HttpHost target = URIUtils.extractHost(baseUrl.toURI());
                    final HttpRequest request = new BasicHttpRequest("GET",
                            urlBuilder.buildRelativeUrl(queryParam2.isPresent() ? null : m.id,
                                    queryParam2.isPresent() ? of(queryParam2.get(), m.id) : null));
                    final HttpAsyncRequestProducer producer = new BasicAsyncRequestProducer(target, request);
                    // create the consumer
                    final ZeroCopyConsumer<File> consumer = new ZeroCopyConsumer<File>(outfile) {
                        @Override
                        protected File process(final HttpResponse response, final File file,
                                final ContentType contentType) throws Exception {
                            final StatusLine status = response.getStatusLine();
                            if (LOGGER.isDebugEnabled())
                                LOGGER.debug(
                                        new StringBuilder("Got file: statusCode=").append(status.getStatusCode())
                                                .append(", file=").append(file.getAbsolutePath()).toString());
                            if (status.getStatusCode() != HttpStatus.SC_OK)
                                throw new ClientProtocolException(
                                        new StringBuilder("Object fetch failed: ").append(status).toString());
                            return file;
                        }
                    };
                    // prepare request
                    m.future = asyncHttpClient.execute(producer, consumer, new FutureCallback<File>() {
                        @Override
                        public void cancelled() {
                            toBeCompleted.update(m.id, FetchStatus.CANCELLED);
                            LOGGER.info("Task cancelled");
                        }

                        @Override
                        public void completed(final File result) {
                            try {
                                final Path path = result.toPath();
                                Files.move(path, path.resolveSibling(removeEnd(result.getName(), ".partial")),
                                        REPLACE_EXISTING);
                                toBeCompleted.update(m.id, FetchStatus.COMPLETED);
                            } catch (IOException ex) {
                                toBeCompleted.update(m.id, FetchStatus.FAILED);
                                LOGGER.error("Fecth failed to move file to its final destination with error", ex);
                            }
                        }

                        @Override
                        public void failed(final Exception ex) {
                            toBeCompleted.update(m.id, FetchStatus.FAILED);
                            LOGGER.error("Fecth failed with error", ex);
                        }
                    });
                } catch (Exception e) {
                    LOGGER.error(new StringBuilder("Failed to fetch object with id: ").append(m.id).toString(), e);
                }
                return m;
            }).forEach(m -> {
                try {
                    // submit requests and wait for completion
                    m.future.get();
                } catch (Exception ignore) {
                    /* exceptions are handled in the callback functions */ }
            });
        }
        return toBeCompleted;
    }

    private CloseableHttpAsyncClient createFiberCloseableHttpAsyncClient() {
        return FiberCloseableHttpAsyncClient.wrap(HttpAsyncClients.custom().setMaxConnPerRoute(concurrencyLevel)
                .setMaxConnTotal(concurrencyLevel).build());
    }

    /**
     * Extends Java 8 {@link CompletableFuture} with real cancellation that will attempt to cancel the requests managed by this future.
     * @author Erik Torres
     * @since 0.0.1
     * @see <a href="https://dzone.com/articles/completablefuture-cant-be">CompletableFuture can't be Interrupted</a>
     */
    public static class FecthFuture extends CompletableFuture<Map<String, FetchStatus>> {

        private final List<FetchMonitor> monList;

        public FecthFuture(final List<String> ids) {
            this.monList = synchronizedList(requireNonNull(ids, "A non-empty list expected").stream()
                    .map(FetchMonitor::new).collect(Collectors.toList()));
            checkArgument(!monList.isEmpty(), "A non-empty list expected");
        }

        public void update(final String id, final FetchStatus newStatus) {
            monList.stream().filter(m -> m.id.equals(id)).findFirst().ifPresent(m -> {
                m.status = requireNonNull(newStatus, "A non-empty status expected");
            });
            if (!monList.stream().anyMatch(m -> FetchStatus.PENDING.equals(m.status))) {
                complete(monList.stream().collect(Collectors.toMap(FetchMonitor::getId, FetchMonitor::getStatus)));
            }
        }

        @Override
        public boolean cancel(final boolean mayInterruptIfRunning) {
            monList.stream().forEach(m -> {
                try {
                    m.future.cancel(true);
                } catch (Exception ignore) {
                }
            });
            return super.cancel(mayInterruptIfRunning);
        }

    }

    /**
     * Monitors fetch operation status.
     * @author Erik Torres
     * @since 0.0.1
     */
    private static class FetchMonitor {

        private final String id;
        private Future<File> future;
        private FetchStatus status = FetchStatus.PENDING;

        public FetchMonitor(final String id) {
            this.id = id;
        }

        public String getId() {
            return id;
        }

        public FetchStatus getStatus() {
            return status;
        }

    }

    /**
     * Possible status after object fetching.
     * @author Erik Torres
     * @since 0.0.1
     */
    public enum FetchStatus {
        PENDING, COMPLETED, CANCELLED, FAILED
    }

}