io.mandrel.worker.WorkerContainer.java Source code

Introduction

Here is the source code for io.mandrel.worker.WorkerContainer.java. WorkerContainer drives the crawling work for a single Spider: it runs one Loop per available processor behind a shared politeness Barrier, initializes the spider's metadata, blob and document stores as well as its requesters, and exposes Monitor-guarded start/pause/kill lifecycle transitions.

Source

/*
 * Licensed to Mandrel under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Mandrel licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package io.mandrel.worker;

import io.mandrel.blob.BlobStore;
import io.mandrel.blob.BlobStores;
import io.mandrel.cluster.discovery.DiscoveryClient;
import io.mandrel.common.container.AbstractContainer;
import io.mandrel.common.container.ContainerStatus;
import io.mandrel.common.data.Spider;
import io.mandrel.common.service.TaskContext;
import io.mandrel.data.extract.ExtractorService;
import io.mandrel.document.DocumentStore;
import io.mandrel.document.DocumentStores;
import io.mandrel.metadata.MetadataStore;
import io.mandrel.metadata.MetadataStores;
import io.mandrel.metrics.Accumulators;
import io.mandrel.requests.Requester;
import io.mandrel.requests.Requesters;
import io.mandrel.transport.Clients;

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.stream.IntStream;

import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.Accessors;
import lombok.extern.slf4j.Slf4j;

import org.apache.commons.lang3.concurrent.BasicThreadFactory;

import com.google.common.util.concurrent.Monitor;

@Slf4j
@Data
@EqualsAndHashCode(callSuper = false)
@Accessors(chain = true, fluent = true)
public class WorkerContainer extends AbstractContainer {

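    // Guards the start/pause/kill transitions below so they never run concurrently.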
    private final Monitor monitor = new Monitor();
    private final TaskContext context = new TaskContext();
    private final ExtractorService extractorService;
    private final ScheduledExecutorService executor;
    private final List<Loop> loops;

    public WorkerContainer(ExtractorService extractorService, Accumulators accumulators, Spider spider,
            Clients clients, DiscoveryClient discoveryClient) {
        super(accumulators, spider, clients);
        context.setDefinition(spider);

        this.extractorService = extractorService;

        // Create the thread factory
        BasicThreadFactory threadFactory = new BasicThreadFactory.Builder()
                .namingPattern("worker-" + spider.getId() + "-%d").daemon(true).priority(Thread.MAX_PRIORITY)
                .build();

        // Get number of parallel loops
        int parallel = Runtime.getRuntime().availableProcessors();
        // Prepare a pool for the parallel loops plus one extra thread for the barrier refresh
        executor = Executors.newScheduledThreadPool(parallel + 1, threadFactory);

        // Prepare the barrier
        Barrier barrier = new Barrier(spider.getFrontier().getPoliteness(), discoveryClient);
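        // Refresh the barrier's politeness buckets every 10 seconds on the extra pool thread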
        executor.scheduleAtFixedRate(barrier::updateBuckets, 10, 10, TimeUnit.SECONDS);

        // Create one loop per processor and submit it to the pool
        loops = new ArrayList<>(parallel);
        IntStream.range(0, parallel).forEach(idx -> {
            Loop loop = new Loop(extractorService, spider, clients, accumulators.spiderAccumulator(spider.getId()),
                    accumulators.globalAccumulator(), barrier);
            loops.add(loop);
            executor.submit(loop);
        });

        // Init stores
        MetadataStore metadataStore = spider.getStores().getMetadataStore().build(context);
        metadataStore.init();
        MetadataStores.add(spider.getId(), metadataStore);

        BlobStore blobStore = spider.getStores().getBlobStore().build(context);
        blobStore.init();
        BlobStores.add(spider.getId(), blobStore);

        if (spider.getExtractors().getData() != null) {
            spider.getExtractors().getData().forEach(ex -> {
                DocumentStore documentStore = ex.getDocumentStore().metadataExtractor(ex).build(context);
                documentStore.init();
                DocumentStores.add(spider.getId(), ex.getName(), documentStore);
            });
        }

        // Init requesters
        spider.getClient().getRequesters().forEach(r -> {
            Requester<?> requester = r.build(context);

            // Prepare client
            if (requester.strategy().nameResolver() != null) {
                requester.strategy().nameResolver().init();
            }
            if (requester.strategy().proxyServersSource() != null) {
                requester.strategy().proxyServersSource().init();
            }
            requester.init();

            Requesters.add(spider.getId(), requester);
        });

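        // Mark the container as initialized; crawling begins only once start() is called.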
        current.set(ContainerStatus.INITIATED);
    }

    @Override
    public String type() {
        return "worker";
    }

    @Override
    public void start() {
        if (monitor.tryEnter()) {
            try {
                if (!current.get().equals(ContainerStatus.STARTED)) {
                    loops.forEach(Loop::start);
                    current.set(ContainerStatus.STARTED);
                }
            } finally {
                monitor.leave();
            }
        }
    }

    @Override
    public void pause() {
        if (monitor.tryEnter()) {
            try {
                if (!current.get().equals(ContainerStatus.PAUSED)) {
                    loops.forEach(Loop::pause);
                    current.set(ContainerStatus.PAUSED);
                }
            } finally {
                monitor.leave();
            }
        }
    }

    @Override
    public void kill() {
        if (monitor.tryEnter()) {
            try {
                if (!current.get().equals(ContainerStatus.KILLED)) {
                    loops.forEach(Loop::pause);
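                    // Best-effort cleanup: each teardown step below is isolated in its
                    // own try/catch so that one failure cannot block the others, and
                    // errors are only logged at debug level.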
                    try {
                        executor.shutdownNow();
                    } catch (Exception e) {
                        log.debug(e.getMessage(), e);
                    }

                    try {
                        MetadataStores.remove(spider.getId());
                    } catch (Exception e) {
                        log.debug(e.getMessage(), e);
                    }

                    try {
                        BlobStores.remove(spider.getId());
                    } catch (Exception e) {
                        log.debug(e.getMessage(), e);
                    }

                    try {
                        DocumentStores.remove(spider.getId());
                    } catch (Exception e) {
                        log.debug(e.getMessage(), e);
                    }

                    try {
                        Requesters.remove(spider.getId());
                    } catch (Exception e) {
                        log.debug(e.getMessage(), e);
                    }

                    accumulators.destroy(spider.getId());

                    current.set(ContainerStatus.KILLED);
                }
            } finally {
                monitor.leave();
            }
        }
    }

    @Override
    public void register() {
        WorkerContainers.add(spider.getId(), this);
    }

    @Override
    public void unregister() {
        WorkerContainers.remove(spider.getId());
    }
}
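
Usage

For reference, here is a minimal wiring sketch showing the container lifecycle. It assumes the collaborators (extractorService, accumulators, spider, clients and discoveryClient) are already available from the surrounding Mandrel application, for example through dependency injection; those names are placeholders, and only the calls on WorkerContainer itself come from the class above.

// Hypothetical usage sketch -- the collaborator instances are assumed to be
// supplied by the application and are not constructed here.
WorkerContainer container = new WorkerContainer(extractorService, accumulators,
        spider, clients, discoveryClient);

container.register();   // publish the container in WorkerContainers
container.start();      // switch the per-processor crawl loops to running
// ... crawling runs ...
container.pause();      // suspend the loops without tearing anything down
container.kill();       // stop the executor and remove the spider's stores
container.unregister(); // remove the container from WorkerContainers

Note that kill() shuts down the scheduled executor and removes the spider's stores and requesters, so a new WorkerContainer would have to be built to resume crawling afterwards.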