io.mandrel.worker.Loop.java Source code

Java tutorial

Introduction

Here is the source code for io.mandrel.worker.Loop.java

Source

/*
 * Licensed to Mandrel under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Mandrel licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package io.mandrel.worker;

import io.mandrel.blob.Blob;
import io.mandrel.blob.BlobStores;
import io.mandrel.common.data.Spider;
import io.mandrel.common.data.Strategy;
import io.mandrel.common.net.Uri;
import io.mandrel.data.Link;
import io.mandrel.data.content.selector.Selector.Instance;
import io.mandrel.data.extract.ExtractorService;
import io.mandrel.document.Document;
import io.mandrel.endpoints.contracts.Next;
import io.mandrel.metadata.MetadataStores;
import io.mandrel.metrics.GlobalAccumulator;
import io.mandrel.metrics.SpiderAccumulator;
import io.mandrel.requests.ConnectTimeoutException;
import io.mandrel.requests.ReadTimeoutException;
import io.mandrel.requests.Requester;
import io.mandrel.requests.Requesters;
import io.mandrel.transport.Clients;
import io.mandrel.transport.RemoteException;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.stream.Collectors;

import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;

import org.apache.commons.collections.CollectionUtils;
import org.springframework.util.StopWatch;

/**
 * Weeeeeeeeeeeeeeeeehhh!!
 */
@Slf4j
@RequiredArgsConstructor
public class Loop implements Runnable {

    private final ExtractorService extractorService;
    private final Spider spider;
    private final Clients clients;

    private final SpiderAccumulator spiderAccumulator;
    private final GlobalAccumulator globalAccumulator;

    private final Barrier barrier;

    private final AtomicBoolean run = new AtomicBoolean(false);

    public void start() {
        run.set(true);
    }

    public void pause() {
        run.set(false);
    }

    public boolean isRunning() {
        return run.get();
    }

    @Override
    public void run() {
        while (true) {

            try {
                if (!run.get()) {
                    log.trace("Waiting...");
                    try {
                        TimeUnit.MILLISECONDS.sleep(2000);
                    } catch (InterruptedException e) {
                        // Don't care
                        log.trace("", e);
                    }
                    continue;
                }

                log.trace("> Asking for uri...");
                Next next = clients.onRandomFrontier().map(frontier -> frontier.next(spider.getId())).get(20000,
                        TimeUnit.MILLISECONDS);
                Uri uri = next.getUri();

                if (uri != null) {

                    log.trace("> Getting uri {} !", uri);

                    //
                    StopWatch watch = new StopWatch();
                    watch.start();

                    //
                    Optional<Requester<? extends Strategy>> requester = Requesters.of(spider.getId(),
                            uri.getScheme());
                    if (requester.isPresent()) {
                        Requester<? extends Strategy> r = requester.get();

                        Blob blob = null;
                        try {
                            blob = processBlob(uri, watch, r);
                        } catch (Exception t) {
                            // TODO create and use internal exception instead...
                            if (t instanceof ConnectTimeoutException) {
                                spiderAccumulator.incConnectTimeout();
                                add(spider.getId(), uri);
                            } else if (t instanceof ReadTimeoutException) {
                                spiderAccumulator.incReadTimeout();
                                add(spider.getId(), uri);
                            } else {
                                log.debug("Error while looping", t);
                            }
                        } finally {
                            barrier.passOrWait(
                                    blob != null && blob.getMetadata() != null ? blob.getMetadata().getSize()
                                            : null);
                        }
                    } else {
                        // TODO Unknown protocol
                        log.debug("Unknown protocol, can not find requester for '{}'", uri.getScheme());
                    }
                } else {
                    log.trace("Frontier returned null Uri, waiting");
                    try {
                        TimeUnit.MILLISECONDS.sleep(10000);
                    } catch (InterruptedException e) {
                        // Don't care
                        log.trace("", e);
                    }
                }
            } catch (RemoteException e) {
                switch (e.getError()) {
                case G_UNKNOWN:
                    log.warn("Got a problem, waiting 2 sec...", e);
                    try {
                        TimeUnit.MILLISECONDS.sleep(2000);
                    } catch (InterruptedException ie) {
                        // Don't care
                        log.trace("", ie);
                    }
                }
            } catch (Exception e) {
                log.warn("Got a problem, waiting 2 sec...", e);
                try {
                    TimeUnit.MILLISECONDS.sleep(2000);
                } catch (InterruptedException ie) {
                    // Don't care
                    log.trace("", ie);
                }
            }
        }
    }

    protected Blob processBlob(Uri uri, StopWatch watch, Requester<? extends Strategy> r) throws Exception {
        Blob blob;
        blob = r.get(uri);

        watch.stop();

        log.trace("> Start parsing data for {}", uri);

        blob.getMetadata().getFetchMetadata().setTimeToFetch(watch.getTotalTimeMillis());

        updateMetrics(watch, blob);

        Map<String, Instance<?>> cachedSelectors = new HashMap<>();
        if (spider.getExtractors() != null && spider.getExtractors().getData() != null) {
            log.trace(">  - Extracting documents for {}...", uri);
            spider.getExtractors().getData().forEach(ex -> {
                List<Document> documents = extractorService.extractThenFormatThenStore(spider.getId(),
                        cachedSelectors, blob, ex);

                if (documents != null) {
                    spiderAccumulator.incDocumentForExtractor(ex.getName(), documents.size());
                }
            });
            log.trace(">  - Extracting documents for {} done!", uri);
        }

        if (spider.getExtractors().getOutlinks() != null) {
            log.trace(">  - Extracting outlinks for {}...", uri);
            final Uri theUri = uri;
            spider.getExtractors().getOutlinks().forEach(ol -> {
                Set<Link> allFilteredOutlinks = extractorService
                        .extractAndFilterOutlinks(spider, theUri, cachedSelectors, blob, ol).getRight();
                blob.getMetadata().getFetchMetadata().setOutlinks(allFilteredOutlinks);
                add(spider.getId(), allFilteredOutlinks.stream().map(l -> l.getUri()).collect(Collectors.toSet()));
            });
            log.trace(">  - Extracting outlinks done for {}!", uri);
        }

        BlobStores.get(spider.getId()).ifPresent(b -> b.putBlob(blob.getMetadata().getUri(), blob));

        log.trace(">  - Storing metadata for {}...", uri);
        MetadataStores.get(spider.getId()).addMetadata(blob.getMetadata().getUri(), blob.getMetadata());
        log.trace(">  - Storing metadata for {} done!", uri);

        log.trace("> End parsing data for {}", uri);
        return blob;
    }

    protected void updateMetrics(StopWatch watch, Blob blob) {
        spiderAccumulator.incNbPages();
        globalAccumulator.incNbPages();

        spiderAccumulator.incPageForStatus(blob.getMetadata().getFetchMetadata().getStatusCode());
        globalAccumulator.incPageForStatus(blob.getMetadata().getFetchMetadata().getStatusCode());

        spiderAccumulator.incPageForHost(blob.getMetadata().getUri().getHost());
        globalAccumulator.incPageForHost(blob.getMetadata().getUri().getHost());

        spiderAccumulator.incTotalTimeToFetch(watch.getLastTaskTimeMillis());

        if (blob.getMetadata().getSize() != null) {
            spiderAccumulator.incTotalSize(blob.getMetadata().getSize());
            globalAccumulator.incTotalSize(blob.getMetadata().getSize());
        }
    }

    protected void add(long spiderId, Set<Uri> uris) {
        if (CollectionUtils.isNotEmpty(uris)) {
            clients.onRandomFrontier().with(frontier -> frontier.mschedule(spiderId, uris));
        }
    }

    protected void add(long spiderId, Uri uri) {
        if (uri != null) {
            clients.onRandomFrontier().with(frontier -> frontier.schedule(spiderId, uri));
        }
    }
}