io.mandrel.data.extract.ExtractorService.java Source code

Introduction

Here is the source code for io.mandrel.data.extract.ExtractorService.java
Source

/*
 * Licensed to Mandrel under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Mandrel licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package io.mandrel.data.extract;

import io.mandrel.blob.Blob;
import io.mandrel.common.data.Spider;
import io.mandrel.common.net.Uri;
import io.mandrel.data.Link;
import io.mandrel.data.content.DataExtractor;
import io.mandrel.data.content.DefaultDataExtractor;
import io.mandrel.data.content.OutlinkExtractor;
import io.mandrel.data.content.selector.DataConverter;
import io.mandrel.data.content.selector.Selector.Instance;
import io.mandrel.document.Document;
import io.mandrel.document.DocumentStores;
import io.mandrel.metadata.MetadataStores;
import io.mandrel.script.ScriptingService;

import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import javax.inject.Inject;

import lombok.extern.slf4j.Slf4j;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.springframework.stereotype.Component;

import us.codecraft.xsoup.xevaluator.XElement;

/**
 * Runs outlink and data extractors against a fetched {@link Blob}, and
 * optionally filters, deduplicates and stores what they produce.
 */
@Component
@Slf4j
public class ExtractorService {

    private final ScriptingService scriptingService;

    /**
     * @param scriptingService
     *            scripting service handed to the extractors and formatters
     */
    @Inject
    public ExtractorService(ScriptingService scriptingService) {
        super();
        this.scriptingService = scriptingService;
    }

    /**
     * Extracts the outlinks of a blob, then keeps only those that have a
     * non-null URI, pass every link filter configured on the spider (if
     * any), and whose URI is returned by the spider's metadata store
     * {@code deduplicate} call.
     *
     * @param spider
     *            spider whose link filters and metadata store are used
     * @param uri
     *            URI of the page being processed (only used for logging)
     * @param cachedSelectors
     *            selector instances passed through to the extractor
     * @param blob
     *            the fetched content
     * @param ol
     *            the outlink extractor definition
     * @return a pair whose left side is every extracted outlink and whose
     *         right side is the filtered and deduplicated subset
     */
    public Pair<Set<Link>, Set<Link>> extractAndFilterOutlinks(Spider spider, Uri uri,
            Map<String, Instance<?>> cachedSelectors, Blob blob, OutlinkExtractor ol) {
        // Find outlinks in page
        Set<Link> outlinks = extractOutlinks(cachedSelectors, blob, ol);
        log.trace("Finding outlinks for url {}: {}", uri, outlinks);

        // Filter outlinks
        Set<Link> filteredOutlinks = null;
        if (outlinks != null) {
            // Links without a URI cannot be filtered or deduplicated.
            Stream<Link> stream = outlinks.stream().filter(l -> l != null && l.getUri() != null);
            if (spider.getFilters() != null && CollectionUtils.isNotEmpty(spider.getFilters().getLinks())) {
                // A link is kept only if ALL configured filters accept it.
                stream = stream
                        .filter(link -> spider.getFilters().getLinks().stream().allMatch(f -> f.isValid(link)));
            }
            filteredOutlinks = stream.collect(Collectors.toSet());
        }

        Set<Link> allFilteredOutlinks = null;
        if (filteredOutlinks != null) {
            // Keep only the links whose URI the metadata store hands back.
            Set<Uri> res = MetadataStores.get(spider.getId())
                    .deduplicate(filteredOutlinks.stream().map(Link::getUri).collect(Collectors.toList()));
            allFilteredOutlinks = filteredOutlinks.stream().filter(f -> res.contains(f.getUri()))
                    .collect(Collectors.toSet());
        }

        log.trace("And filtering {}", allFilteredOutlinks);
        return Pair.of(outlinks, allFilteredOutlinks);
    }

    /**
     * Extracts the outlinks of a blob and converts each matched element
     * into a {@link Link} (URI, rel, title and text; blank values become
     * {@code null}). Non-empty results are then passed through
     * {@code DefaultDataExtractor.format}.
     *
     * @param cachedSelectors
     *            selector instances passed through to the extractor
     * @param blob
     *            the fetched content
     * @param extractor
     *            the outlink extractor definition
     * @return the distinct extracted links; never {@code null}, empty when
     *         extraction or formatting produced no list
     */
    public Set<Link> extractOutlinks(Map<String, Instance<?>> cachedSelectors, Blob blob,
            OutlinkExtractor extractor) {

        List<Link> outlinks = DefaultDataExtractor.extract(cachedSelectors, blob, null, extractor.getExtractor(),
                new DataConverter<XElement, Link>() {
                    @Override
                    public Link convert(XElement element) {
                        Link link = new Link();

                        // NOTE(review): presumably a jsoup element, in which
                        // case absUrl resolves against the document base URI
                        // - confirm.
                        String uri = element.getElement().absUrl("href");
                        link.setUri(StringUtils.isNotBlank(uri) ? Uri.create(uri) : null);

                        String rel = element.getElement().attr("rel");
                        link.setRel(StringUtils.isNotBlank(rel) ? rel : null);

                        String title = element.getElement().attr("title");
                        link.setTitle(StringUtils.isNotBlank(title) ? title : null);

                        String text = element.getElement().ownText();
                        link.setText(StringUtils.isNotBlank(text) ? text : null);
                        return link;
                    }
                });

        if (outlinks != null && !outlinks.isEmpty()) {
            outlinks = (List<Link>) DefaultDataExtractor.format(scriptingService, blob.getMetadata(), extractor,
                    outlinks);
        }

        // new HashSet<>(null) throws a NullPointerException, so a missing
        // list (from extraction or from formatting) is mapped to an empty set.
        if (outlinks == null) {
            return new HashSet<>();
        }
        return new HashSet<>(outlinks);
    }

    /**
     * Extracts and formats documents from a blob, then saves them in the
     * document store registered for the given spider and extractor name.
     * Nothing is saved when the extraction result is {@code null}.
     *
     * @param spiderId
     *            id of the spider owning the document store
     * @param cachedSelectors
     *            selector instances passed through to the extractor
     * @param blob
     *            the fetched content
     * @param extractor
     *            the data extractor definition
     * @return the extracted documents, exactly as returned by
     *         {@link #extractThenFormat}
     */
    public List<Document> extractThenFormatThenStore(long spiderId, Map<String, Instance<?>> cachedSelectors,
            Blob blob, DataExtractor extractor) {

        List<Document> documents = extractThenFormat(cachedSelectors, blob, extractor);

        // Store the result
        if (documents != null) {
            DocumentStores.get(spiderId, extractor.getName()).get().save(documents);
        }

        return documents;
    }

    /**
     * Delegates to the extractor's own {@code extract} method, passing this
     * service's scripting service.
     *
     * @param cachedSelectors
     *            selector instances passed through to the extractor
     * @param blob
     *            the fetched content
     * @param extractor
     *            the data extractor definition
     * @return whatever the extractor returns
     */
    public List<Document> extractThenFormat(Map<String, Instance<?>> cachedSelectors, Blob blob,
            DataExtractor extractor) {
        return extractor.extract(scriptingService, cachedSelectors, blob);
    }
}