Java tutorial
/* * Licensed to Mandrel under one or more contributor * license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright * ownership. Mandrel licenses this file to you under * the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package io.mandrel.spider; import io.mandrel.cluster.discovery.DiscoveryClient; import io.mandrel.cluster.discovery.ServiceIds; import io.mandrel.cluster.discovery.ServiceInstance; import io.mandrel.cluster.instance.StateService; import io.mandrel.common.MandrelException; import io.mandrel.common.NotFoundException; import io.mandrel.common.data.Spider; import io.mandrel.common.data.SpiderStatuses; import io.mandrel.common.service.TaskContext; import io.mandrel.common.sync.SyncRequest; import io.mandrel.common.sync.SyncResponse; import io.mandrel.data.filters.link.AllowedForDomainsFilter; import io.mandrel.data.filters.link.SkipAncorFilter; import io.mandrel.data.filters.link.UrlPatternFilter; import io.mandrel.data.source.FixedSource.FixedSourceDefinition; import io.mandrel.data.source.Source; import io.mandrel.data.validation.Validators; import io.mandrel.metrics.Accumulators; import io.mandrel.metrics.MetricsService; import io.mandrel.timeline.Event; import io.mandrel.timeline.Event.SpiderInfo.SpiderEventType; import io.mandrel.timeline.TimelineService; import io.mandrel.transport.Clients; import java.net.URI; import java.time.LocalDateTime; import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; import javax.annotation.PostConstruct; import lombok.extern.slf4j.Slf4j; import org.apache.commons.collections.CollectionUtils; import org.kohsuke.randname.RandomNameGenerator; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.data.domain.Page; import org.springframework.data.domain.Pageable; import org.springframework.scheduling.annotation.Scheduled; import org.springframework.stereotype.Component; import org.springframework.validation.BindException; import org.springframework.validation.BindingResult; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.base.Throwables; import com.google.common.util.concurrent.Monitor; @Component @Slf4j public class SpiderService { @Autowired private SpiderRepository spiderRepository; @Autowired private TimelineService timelineService; @Autowired private Clients clients; @Autowired private DiscoveryClient discoveryClient; @Autowired private StateService stateService; @Autowired private Accumulators accumulators; @Autowired private MetricsService metricsService; @Autowired private ObjectMapper objectMapper; private final RandomNameGenerator generator = new RandomNameGenerator(); private final Monitor monitor = new Monitor(); @PostConstruct public void init() { // TODO Load the journal of commands } @Scheduled(fixedRate = 2000) public void sync() { if (stateService.isStarted()) { // TODO HOW TO in case of multiple controller // -> Acquiring distributed lock if (monitor.tryEnter()) { try { log.trace("Syncing the nodes from the controller..."); SyncRequest sync = new SyncRequest(); // Load the existing spiders from the database List<Spider> spiders = spiderRepository.listActive(); if (CollectionUtils.isNotEmpty(spiders)) { sync.setDefinitions(spiders.stream().map(spider -> { try { return objectMapper.writeValueAsBytes(spider); } catch (Exception e) { throw Throwables.propagate(e); } }).collect(Collectors.toList())); } // Sync first the controllers discoveryClient.getInstances(ServiceIds.controller()).forEach(instance -> { log.trace("Syncing controller {}", instance); try { SyncResponse response = clients.onController(instance.getHostAndPort()) .map(controller -> controller.syncControllers(sync)); if (response.anyAction()) { log.debug( "On controller {}:{}, after sync: {} created, {} updated, {} killed, {} started, {} paused", instance.getHost(), instance.getPort(), response.getCreated(), response.getUpdated(), response.getKilled(), response.getPaused()); } } catch (Exception e) { log.warn("Can not sync controller {}:{} due to: {}", instance.getHost(), instance.getPort(), e.getMessage()); } }); // And then the frontiers discoveryClient.getInstances(ServiceIds.frontier()).forEach(instance -> { log.trace("Syncing frontier {}", instance); try { SyncResponse response = clients.onFrontier(instance.getHostAndPort()) .map(frontier -> frontier.syncFrontiers(sync)); if (response.anyAction()) { log.debug( "On frontier {}:{}, after sync: {} created, {} updated, {} killed, {} started, {} paused", instance.getHost(), instance.getPort(), response.getCreated(), response.getUpdated(), response.getKilled(), response.getPaused()); } } catch (Exception e) { log.warn("Can not sync frontier {}:{} due to: {}", instance.getHost(), instance.getPort(), e.getMessage()); } }); // And then the workers discoveryClient.getInstances(ServiceIds.worker()).forEach(instance -> { log.trace("Syncing worker {}", instance); try { SyncResponse response = clients.onWorker(instance.getHostAndPort()) .map(worker -> worker.syncWorkers(sync)); if (response.anyAction()) { log.debug( "On frontier {}:{}, after sync: {} created, {} updated, {} killed, {} started, {} paused", instance.getHost(), instance.getPort(), response.getCreated(), response.getUpdated(), response.getKilled(), response.getPaused()); } } catch (Exception e) { log.warn("Can not sync worker {}:{} due to: {}", instance.getHost(), instance.getPort(), e.getMessage()); } }); } finally { monitor.leave(); } } } } public Spider update(Spider spider) throws BindException { BindingResult errors = Validators.validate(spider); if (errors.hasErrors()) { errors.getAllErrors().stream().forEach(oe -> log.info(oe.toString())); throw new BindException(errors); } updateTimeline(spider, SpiderEventType.SPIDER_UPDATED); return spiderRepository.update(spider); } public void updateTimeline(Spider spider, SpiderEventType status) { Event event = Event.forSpider(); event.getSpider().setSpiderId(spider.getId()).setSpiderName(spider.getName()).setType(status); timelineService.add(event); } /** * Create a new spider from a fixed list of urls. * * @param urls * @return */ public Spider add(List<String> urls) throws BindException { Spider spider = new Spider(); spider.setName(generator.next()); // Add source FixedSourceDefinition source = new FixedSourceDefinition(); source.setUrls(urls); spider.setSources(Arrays.asList(source)); // Add filters spider.getFilters().getLinks().add(new AllowedForDomainsFilter().domains(urls.stream().map(url -> { return URI.create(url).getHost(); }).collect(Collectors.toList()))); spider.getFilters().getLinks().add(new SkipAncorFilter()); spider.getFilters().getLinks().add(UrlPatternFilter.STATIC); return add(spider); } public long fork(long id) throws BindException { Spider spider = get(id); spider.setId(0); spider.setName(generator.next()); cleanDates(spider); BindingResult errors = Validators.validate(spider); if (errors.hasErrors()) { errors.getAllErrors().stream().forEach(oe -> log.info(oe.toString())); throw new BindException(errors); } spider.setStatus(SpiderStatuses.INITIATED); spider.setCreated(LocalDateTime.now()); spider = spiderRepository.add(spider); return spider.getId(); } public void cleanDates(Spider spider) { spider.setCreated(null); spider.setDeleted(null); spider.setEnded(null); spider.setKilled(null); spider.setPaused(null); spider.setStarted(null); } public Spider add(Spider spider) throws BindException { BindingResult errors = Validators.validate(spider); if (errors.hasErrors()) { errors.getAllErrors().stream().forEach(oe -> log.info(oe.toString())); throw new BindException(errors); } spider.setStatus(SpiderStatuses.INITIATED); spider.setCreated(LocalDateTime.now()); spider = spiderRepository.add(spider); updateTimeline(spider, SpiderEventType.SPIDER_CREATED); return spider; } public Spider get(long id) { return spiderRepository.get(id).orElseThrow(() -> new NotFoundException("Spider not found")); } public Page<Spider> page(Pageable pageable) { return spiderRepository.page(pageable); } public List<Spider> listActive() { return spiderRepository.listActive(); } public List<Spider> listLastActive(int limit) { return spiderRepository.listLastActive(limit); } public void reinject(long spiderId) { Spider spider = get(spiderId); injectSingletonSources(spider); } public void start(long spiderId) { Spider spider = get(spiderId); if (SpiderStatuses.STARTED.equals(spider.getStatus())) { return; } if (SpiderStatuses.KILLED.equals(spider.getStatus())) { throw new MandrelException("Spider cancelled!"); } // Can not start a spider if there no frontier started if (discoveryClient.getInstances(ServiceIds.frontier()).size() < 1) { throw new MandrelException("Can not start spider, you need a least a frontier instance!"); } if (SpiderStatuses.INITIATED.equals(spider.getStatus())) { // injectSingletonSources(spider); } spiderRepository.updateStatus(spiderId, SpiderStatuses.STARTED); updateTimeline(spider, SpiderEventType.SPIDER_CREATED); } public void injectSingletonSources(Spider spider) { // Deploy singleton sources on a random frontier TaskContext context = new TaskContext(); context.setDefinition(spider); spider.getSources().forEach(s -> { Source source = s.build(context); if (source.singleton() && source.check()) { log.debug("Injecting source '{}' ({})", s.name(), s.toString()); ServiceInstance instance = discoveryClient.getInstances(ServiceIds.frontier()).get(0); source.register(uri -> { try { log.trace("Adding uri '{}'", uri); clients.onFrontier(instance.getHostAndPort()) .with(frontier -> frontier.schedule(spider.getId(), uri)); } catch (Exception e) { log.warn("Can not sync due to", e); } }); } }); } public void pause(long spiderId) { Spider spider = get(spiderId); // Update status spiderRepository.updateStatus(spiderId, SpiderStatuses.PAUSED); updateTimeline(spider, SpiderEventType.SPIDER_PAUSED); } public void kill(long spiderId) { Spider spider = get(spiderId); // Update status spiderRepository.updateStatus(spiderId, SpiderStatuses.KILLED); updateTimeline(spider, SpiderEventType.SPIDER_KILLED); } public void delete(long spiderId) { Spider spider = get(spiderId); // Update status spiderRepository.updateStatus(spiderId, SpiderStatuses.DELETED); updateTimeline(spider, SpiderEventType.SPIDER_DELETED); metricsService.delete(spiderId); } }