io.seldon.importer.articles.AttributesImporterUtils.java Source code

Java tutorial

Introduction

Here is the source code for io.seldon.importer.articles.AttributesImporterUtils.java

Source

/*
 * Seldon -- open source prediction engine
 * =======================================
 * Copyright 2011-2015 Seldon Technologies Ltd and Rummble Ltd (http://www.seldon.io/)
 *
 **********************************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at       
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ********************************************************************************************** 
*/
package io.seldon.importer.articles;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;

import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.codehaus.jackson.JsonGenerationException;
import org.codehaus.jackson.map.JsonMappingException;
import org.codehaus.jackson.map.ObjectMapper;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Static helpers for the article importers: extracting tags from a parsed
 * HTML document, logging an {@link ItemProcessResult} as JSON, and reducing a
 * URL to its scheme/host/port/path form.
 */
public class AttributesImporterUtils {

    /**
     * Collects the tags of an article.
     * <p>
     * The elements matched by {@code tagsCssSelector} are read in one of two ways:
     * if the first match carries a non-blank {@code content} attribute it is treated
     * as a comma-separated list (see {@link #getTagsPartsFromSingleElement(Element)});
     * otherwise the text of every matched element is taken as one tag (see
     * {@link #getTagsPartsFromMultipleElement(Elements)}). Two-word phrases built from
     * adjacent tags that also occur in the title are added as extra tags (see
     * {@link #createExtraTagsPartsFromTitle(String, List)}).
     *
     * @param articleDoc      parsed article document to query
     * @param tagsCssSelector CSS selector locating the tag element(s); when blank or
     *                        {@code null} no lookup is done
     * @param title           article title used to derive extra phrase tags; may be
     *                        blank or {@code null}
     * @return a new mutable set of lower-cased tags, empty when the selector is blank
     */
    public static Set<String> getTags(Document articleDoc, String tagsCssSelector, String title) {
        Set<String> tagSet = new HashSet<String>();
        if (StringUtils.isBlank(tagsCssSelector)) {
            return tagSet;
        }

        Elements tagsElements = articleDoc.select(tagsCssSelector);
        Element tagsElement = tagsElements.first();

        List<String> tagsParts;
        // isNotBlank already rejects null, so no separate null check on the attribute.
        if ((tagsElement != null) && StringUtils.isNotBlank(tagsElement.attr("content"))) {
            tagsParts = getTagsPartsFromSingleElement(tagsElement);
        } else {
            tagsParts = getTagsPartsFromMultipleElement(tagsElements);
        }

        tagSet.addAll(tagsParts);
        tagSet.addAll(createExtraTagsPartsFromTitle(title, tagsParts));
        return tagSet;
    }

    /**
     * Splits the {@code content} attribute of a single element on commas.
     * <p>
     * Intended for selectors such as {@code -tagsSelector "head > meta[name=keywords]"}.
     * Each piece is trimmed and lower-cased; empty pieces between consecutive commas
     * are kept as empty strings.
     *
     * @param tagsElement element whose {@code content} attribute holds the tag list
     * @return a new mutable list of the pieces, in document order
     */
    public static List<String> getTagsPartsFromSingleElement(Element tagsElement) {
        String[] parts = tagsElement.attr("content").split(",");

        List<String> tagsParts = new ArrayList<String>(parts.length);
        for (String part : parts) {
            // Locale.ROOT keeps the normalisation independent of the JVM default locale.
            tagsParts.add(part.trim().toLowerCase(Locale.ROOT));
        }
        return tagsParts;
    }

    /**
     * Reads one tag from the text of each matched element.
     * <p>
     * Intended for selectors such as {@code -tagsSelector "section[id=tags] > a"}.
     * Each element's text is stripped of surrounding whitespace and lower-cased.
     *
     * @param tagsElements matched elements, one per tag
     * @return a new mutable list of tags, in iteration order of {@code tagsElements}
     */
    public static List<String> getTagsPartsFromMultipleElement(Elements tagsElements) {
        List<String> tagsParts = new ArrayList<String>();

        for (Element e : tagsElements) {
            tagsParts.add(StringUtils.strip(e.text()).toLowerCase(Locale.ROOT));
        }
        return tagsParts;
    }

    /**
     * Builds extra two-word tags from adjacent entries of {@code tagsParts} that
     * appear, joined by a single space, inside the title (case-insensitively).
     * <p>
     * A blank entry never takes part in a phrase, and it also prevents its two
     * neighbours from being paired with each other.
     *
     * @param title     article title; when blank or {@code null} no phrases are produced
     * @param tagsParts ordered tag list to pair up
     * @return a new mutable list of lower-cased phrases, possibly empty
     */
    public static List<String> createExtraTagsPartsFromTitle(String title, List<String> tagsParts) {
        List<String> titleParts = new ArrayList<String>();
        // Guard first: lower-casing a null title used to throw a NullPointerException.
        if (StringUtils.isBlank(title)) {
            return titleParts;
        }

        String titleLower = title.toLowerCase(Locale.ROOT);
        String lastPart = null;
        for (String part : tagsParts) {
            if (StringUtils.isNotBlank(part) && StringUtils.isNotBlank(lastPart)) {
                // Lower-case before comparing so the match against the lower-cased
                // title does not depend on the casing of the supplied tags.
                String phrase = (lastPart + " " + part).toLowerCase(Locale.ROOT);
                if (titleLower.contains(phrase)) {
                    titleParts.add(phrase);
                }
            }
            // Updated for blank parts too, so a blank entry breaks the adjacency.
            lastPart = part;
        }
        return titleParts;
    }

    /**
     * Logs the given result at INFO level as {@code "ItemProcessResult: <json>"}.
     * <p>
     * If JSON serialisation fails, the failure is logged at WARN level with its
     * stack trace and the INFO line is still written with the placeholder
     * {@code "none"}.
     *
     * @param logger            logger to write to
     * @param itemProcessResult result to serialise and log
     */
    public static void logResult(Logger logger, ItemProcessResult itemProcessResult) {
        String json = "none";
        try {
            json = new ObjectMapper().writeValueAsString(itemProcessResult);
        } catch (IOException e) {
            // JsonGenerationException and JsonMappingException are IOException
            // subclasses, so this single clause covers all three declared failures.
            logger.warn("Failed to serialise ItemProcessResult to JSON", e);
        }
        logger.info("ItemProcessResult: " + json);
    }

    /**
     * Reduces a URL to {@code protocol://host[:port]path}, dropping the query
     * string, fragment and user-info.
     * <p>
     * The port is included only when the URL specifies one explicitly.
     *
     * @param url absolute URL to reduce
     * @return the reduced URL
     * @throws Exception if {@code url} cannot be parsed (a
     *                   {@link java.net.MalformedURLException} from the {@link URL}
     *                   constructor)
     */
    public static String getBaseUrl(String url) throws Exception {
        URL aURL = new URL(url);

        StringBuilder baseUrl = new StringBuilder();
        baseUrl.append(aURL.getProtocol()).append("://").append(aURL.getHost());

        // getPort() is -1 when the URL has no explicit port; keep an explicit one so
        // that e.g. http://host:8080/a is not collapsed into http://host/a.
        int port = aURL.getPort();
        if (port != -1) {
            baseUrl.append(':').append(port);
        }

        baseUrl.append(aURL.getPath());
        return baseUrl.toString();
    }
}