io.seldon.importer.articles.FileItemAttributesImporter.java Source code

Java tutorial

Introduction

Here is the source code for io.seldon.importer.articles.FileItemAttributesImporter.java

Source

/*
 * Seldon -- open source prediction engine
 * =======================================
 * Copyright 2011-2015 Seldon Technologies Ltd and Rummble Ltd (http://www.seldon.io/)
 *
 **********************************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at       
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ********************************************************************************************** 
*/
package io.seldon.importer.articles;

import io.seldon.client.DefaultApiClient;
import io.seldon.client.beans.ItemBean;
import io.seldon.client.exception.ApiException;
import io.seldon.importer.articles.category.CategoryExtractor;
import io.seldon.utils.CollectionTools;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import com.sampullara.cli.Args;
import com.sampullara.cli.Argument;

public class FileItemAttributesImporter {

    // Class-wide log4j logger, named after this class.
    private static Logger logger = Logger.getLogger(FileItemAttributesImporter.class.getName());

    // Maximum number of items requested from the API for EACH item type in process().
    @Argument(alias = "n", description = "How many items to import", required = false)
    static Integer numItems = 500;

    // NOTE(review): not referenced by any method of this class; kept for command-line compatibility.
    @Argument(alias = "i", description = "Interval in secs between runs", required = false)
    static Integer intervalSecs = 600;

    // Passed to Jsoup's connection timeout when fetching an article
    // (presumably milliseconds, the unit Jsoup uses -- confirm).
    @Argument(alias = "gt", description = "Timeout on article http GET", required = false)
    static Integer httpGetTimeout = 2000;

    // apiUrl, consumerKey and consumerSecret are handed to the DefaultApiClient constructor in main().
    @Argument(alias = "api-url", description = "API Endpoint", required = true)
    static String apiUrl;

    @Argument(alias = "consumer-key", description = "Consumer Key", required = true)
    static String consumerKey;

    @Argument(alias = "consumer-secret", description = "Consumer Secret", required = true)
    static String consumerSecret;

    // process() issues one client.getItems call per entry of this array.
    @Argument(alias = "it", description = "Item type", required = false, delimiter = ",")
    static Integer[] itemTypes = new Integer[] { 0, 1 };

    /*
     * CSS Selector arguments
     *
     * Each selector is applied to the fetched article document and only the
     * first matching element is used. A blank selector disables the
     * corresponding extraction step.
     */

    // The image URL is read from the "content" attribute, falling back to "src" and then "href".
    @Argument(alias = "imageSelector", description = "Image CSS Selector", required = false)
    static String imageCssSelector = "head > meta[property=og:image]";

    // Title is read from the "content" attribute; a non-blank title is mandatory for a successful import.
    @Argument(alias = "titleSelector", description = "Title CSS Selector", required = false)
    static String titleCssSelector = "head > meta[property=og:title]";

    // Lead text is read from the "content" attribute; disabled by default (blank selector).
    @Argument(alias = "leadTextSelector", description = "Lead Text CSS Selector", required = false)
    static String leadTextCssSelector = "";

    // The text of the matched element is stored as the item description.
    @Argument(alias = "textSelector", description = "Article text CSS Selector", required = false)
    static String textCssSelector;

    // Passed as-is to AttributesImporterUtils.getTags together with the document and title.
    @Argument(alias = "tagsSelector", description = "Tags CSS Selector", required = false)
    static String tagsCssSelector = "head > meta[name=keywords]";

    // When set, takes precedence over the reflective extractor chosen by categoryClassPrefix.
    @Argument(alias = "categorySelector", description = "Category CSS Selector", required = false)
    static String categoryCssSelector;

    // When set, takes precedence over the reflective extractor chosen by subCategoryClassPrefix.
    @Argument(alias = "subCategorySelector", description = "Sub Category CSS Selector", required = false)
    static String subCategoryCssSelector;

    // Link is read from the "content" attribute of the matched element.
    @Argument(alias = "linkSelector", description = "URL link CSS Selector", required = false)
    static String linkCssSelector;

    // The "content" attribute is parsed as yyyy-MM-dd'T'HH:mm:ss or yyyy-MM-dd HH:mm:ss.
    @Argument(alias = "publishDateSelector", description = "publish date CSS Selector", required = false)
    static String publishDateCssSelector;

    // Read line by line in process(); each line is used both as the item id and as the URL to fetch.
    @Argument(alias = "urls", description = "File containing list of URLs", required = true)
    static String urlFile;

    // getAttributes() sleeps before a fetch if less than this much time has elapsed since the previous one.
    @Argument(alias = "minFetch", description = "Min time between url requests in msecs", required = false)
    static Integer minFetchGapMsecs = 500;

    /*
     * Arguments to allow success even if we don't find some elements
     */

    // When true, an item is accepted even though no image URL (and no default image) is available.
    @Argument(alias = "noimage", description = "is it ok not to find an image", required = false)
    static boolean imageNotNeeded = false;

    // When true, an item is accepted even though no category could be extracted.
    @Argument(alias = "nocategory", description = "is it ok not to find a category", required = false)
    static boolean categoryNotNeeded = false;

    /*
     * Argument for using Domain
     *
     * When true, getAttributes() derives the host name from the article URL,
     * stores it under the "domain" attribute and rejects the item if the
     * derived domain is blank.
     */
    @Argument(alias = "needdomain", description = "Should domain be used as an attribute", required = false)
    static boolean domainIsNeeded = false;

    /*
     * Defaults
     */

    // Used as the image URL when no image could be extracted from the page.
    @Argument(alias = "defImage", description = "Default image url", required = false)
    static String defImageUrl;

    // Only consulted when categoryCssSelector is blank; the extractor class is loaded reflectively.
    @Argument(alias = "categoryPrefix", description = "The prefix for the supplied category extractor - will be io.seldon.importer.articles.category.<Prefix>CategoryExtractor", required = false)
    static String categoryClassPrefix = "GeneralFirst";

    // Only consulted when subCategoryCssSelector is blank; blank by default, i.e. no sub category extractor.
    @Argument(alias = "subCategoryPrefix", description = "The prefix for the supplied sub category extractor - will be io.seldon.importer.articles.category.<Prefix>SubCategoryExtractor", required = false)
    static String subCategoryClassPrefix = ""; //"GeneralAll";

    // When true, process() extracts attributes but skips the client.updateItem call.
    @Argument(alias = "t", description = "For testing, will not update", required = false)
    static boolean testmode = false;

    // Timeout handed to the DefaultApiClient constructor in main() (units not visible here -- confirm).
    static int API_TIMEOUT = 10000;
    // NOTE(review): the ATTR_*, CONTENT_TYPE_ARTICLE_*, TYPE_* and UNVERIFIED_CONTENT_TYPE
    // constants below are not referenced by the methods of this class; process() reads the
    // same-named constants of ItemAttributesImporter instead.
    static String ATTR_IMG_NAME = "img_url";
    static final String ATTR_CATEGORIES = "categories";
    static final String ATTR_TITLE = "title";

    static final String CONTENT_TYPE_ARTICLE_VALID = "article";
    static final String CONTENT_TYPE_ARTICLE_INVALID = "old_article";

    static final int TYPE_NOT_VALID = 2;
    static final int TYPE_NOT_SET = 0;
    static final int TYPE_VALID = 1;

    static final String UNVERIFIED_CONTENT_TYPE = "unverified_article";
    // Value stored under CONTENT_TYPE for every item whose attributes were extracted successfully.
    static final String VERIFIED_CONTENT_TYPE = "article";

    // Keys of the attribute map built by getAttributes().
    static String CONTENT_TYPE = "content_type";
    static String TITLE = "title";
    static String CATEGORY = "category";
    static String SUBCATEGORY = "subcategory";
    static String IMG_URL = "img_url";
    static String DESCRIPTION = "description";
    static String TAGS = "tags";
    static String LEAD_TEXT = "leadtext";
    static String LINK = "link";
    static String PUBLISH_DATE = "published_date";
    static String DOMAIN = "domain";

    // Running totals used by process() for the "[succeeded/processed percent%]" progress log.
    private int total_item_processed_count = 0;
    private int total_item_succeded_count = 0;

    // Optional watchdog installed by main(); run() calls stopChecking() on it when finishing normally.
    private FailFast failFast = null;

    // API client used to list existing items and to push updated items.
    DefaultApiClient client;

    // System.currentTimeMillis() of the most recent article fetch; drives the minFetchGapMsecs throttle.
    // Plain static field with no synchronization.
    static long lastUrlFetchTime = 0;

    /**
     * Creates an importer that reads and updates items through the given API client.
     *
     * @param client the API client used by {@link #process()} to list and update items
     */
    public FileItemAttributesImporter(DefaultApiClient client) {
        this.client = client;
    }

    /**
     * Runs a single import pass over the URL file and logs how many items were
     * updated. If a fail-fast watchdog has been installed it is told to stop
     * checking once the pass has finished.
     *
     * @throws InterruptedException declared for callers; see {@link #main(String[])}
     */
    public void run() throws InterruptedException {
        logger.info("Starting...");
        logger.info("Processing recent urls...");

        final int updatedItemCount = process();
        final String summary = "Processed with " + updatedItemCount + " updates";
        logger.info(summary);
        logger.info("Processed urls...Finished");

        if (failFast == null) {
            return;
        }
        // Normal exit: the watchdog no longer needs to monitor the main thread.
        failFast.stopChecking();
    }

    /**
     * Returns an ASCII-only, percent-encoded form of the given URL string.
     *
     * <p>The URL is split into scheme, authority, path and query and rebuilt
     * through {@link URI}, which quotes characters that are illegal in each
     * component. The fragment (the part after {@code #}) is intentionally not
     * carried over.
     *
     * <p>NOTE(review): the multi-argument {@code URI} constructors also quote
     * the {@code '%'} character, so input that is already percent-encoded is
     * encoded a second time -- confirm that callers pass raw URLs.
     *
     * @param input an absolute URL, e.g. {@code http://host:8080/some path?q=a b}
     * @return the encoded URL, or {@code null} if {@code input} is not a valid
     *         URL or cannot be converted to a URI (the problem is logged)
     */
    public static String getUrlEncodedString(String input) {
        try {
            URL url = new URL(input);

            // The second argument of this constructor is the AUTHORITY, not the
            // host: passing only the host would silently drop an explicit port
            // or user-info from the result.
            URI uri = new URI(url.getProtocol(), url.getAuthority(), url.getPath(), url.getQuery(), null);

            return uri.toASCIIString();
        } catch (MalformedURLException mue) {
            logger.error("Malformed url " + input);
            return null;
        } catch (URISyntaxException e) {
            logger.error("Failed to tranform url into uri ", e);
            return null;
        }
    }

    /**
     * Imports attributes for every URL listed in {@code urlFile}.
     *
     * <p>First the most recently active items of each configured item type are
     * loaded from the API and indexed by id. Then each line of the URL file is
     * treated as an item id / article URL:
     * <ul>
     * <li>an unknown URL becomes a new item of type 1;</li>
     * <li>an item is (re)processed when its type is not set, or when it has no
     * content type or an unverified one; other items are skipped;</li>
     * <li>for processed items the article is fetched via
     * {@link #getAttributes(String, String)} and, unless {@code testmode} is
     * set, the item is pushed back through the API client.</li>
     * </ul>
     * Blank lines in the URL file are ignored. A failure on one item is logged
     * and does not stop the run.
     *
     * @return the number of items that were imported successfully (in test mode:
     *         the number of items whose attributes were extracted); API and I/O
     *         errors outside a single item are logged and end the pass early
     */
    public int process() {
        int updates = 0;
        try {
            List<ItemBean> items = new ArrayList<ItemBean>();
            for (int i = 0; i < itemTypes.length; i++) {
                List<ItemBean> itemsForType = client.getItems(numItems, itemTypes[i], true, "last_action");
                logger.info("Adding " + itemsForType.size() + " items for item type " + itemTypes[i]);
                items.addAll(itemsForType);
            }
            logger.info("Got " + items.size() + " items from API");

            Map<String, ItemBean> itemMap = new HashMap<String, ItemBean>();
            for (ItemBean item : items) {
                itemMap.put(item.getId(), item);
            }

            BufferedReader reader = new BufferedReader(new FileReader(urlFile));
            try {
                String url;
                int count = 0; // 1-based line number within the URL file
                while ((url = reader.readLine()) != null) {
                    count++;
                    if (StringUtils.isBlank(url)) {
                        // An empty line can never be fetched; do not let it distort the statistics.
                        logger.debug("Ignoring blank line " + count + " in " + urlFile);
                        continue;
                    }

                    ItemBean item = itemMap.get(url);
                    String contentType = null;
                    if (item == null) {
                        item = new ItemBean(url, "", 1);
                    } else if (item.getAttributesName() != null) {
                        // Existing items may come back without an attribute map.
                        contentType = item.getAttributesName().get(ItemAttributesImporter.CONTENT_TYPE);
                    }

                    boolean needsImport = item.getType() == ItemAttributesImporter.TYPE_NOT_SET
                            || contentType == null
                            || ItemAttributesImporter.UNVERIFIED_CONTENT_TYPE.equals(contentType);
                    if (!needsImport) {
                        logger.info("Article : " + item.getId() + " SKIPPED");
                        continue;
                    }

                    total_item_processed_count++;
                    logger.info("Looking at item " + count);
                    System.out.println("Item => " + item.toString());
                    boolean imported = false;
                    try {
                        String category = null;
                        if (item.getAttributesName() != null) {
                            category = item.getAttributesName().get(ItemAttributesImporter.CATEGORY);
                        }

                        Map<String, String> attributes = getAttributes(item.getId(), category);
                        if (attributes != null) {
                            item.setName(attributes.get(ItemAttributesImporter.ATTR_TITLE));
                            item.setAttributesName(attributes);
                            item.setType(ItemAttributesImporter.TYPE_VALID);
                            item.setFirst_action(new Date());
                            item.setLast_action(new Date());
                            if (!testmode) {
                                client.updateItem(item);
                            } else {
                                logger.info("TESTMODE skipping update");
                            }
                            // Count the item only after the update went through, so that
                            // a failing updateItem call is not reported as a success.
                            updates++;
                            total_item_succeded_count++;
                            imported = true;
                        }
                    } catch (Exception e) {
                        logger.warn("Article:" + item.getId() + " error.", e);
                    }

                    String updated_amount_string = String.format("[%d/%d %.0f%%]", total_item_succeded_count,
                            total_item_processed_count,
                            ((((double) total_item_succeded_count / (double) total_item_processed_count)) * 100));
                    if (imported) {
                        logger.info("Article : " + item.getId() + " import - OK " + updated_amount_string);
                        logger.info("Item : " + item);
                    } else {
                        logger.info("Article : " + item.getId() + " import - NOT OK " + updated_amount_string);
                    }
                }
            } finally {
                // Release the file handle even if reading fails half way through.
                reader.close();
            }
        } catch (ApiException e) {
            logger.error("Failed api call", e);
        } catch (IOException e) {
            logger.error("io exception", e);
        }
        return updates;
    }

    /**
     * Fetches the article at {@code url} and extracts the item attributes
     * configured through the CSS selector / extractor arguments.
     *
     * <p>Extraction succeeds only if a title was found and, unless relaxed by
     * the corresponding flags, an image URL and a category were found (and a
     * domain when {@code domainIsNeeded} is set). The outcome, including the
     * list of required attributes that were missing, is reported through
     * {@code AttributesImporterUtils.logResult}.
     *
     * <p>Requests are throttled: consecutive fetches are kept at least
     * {@code minFetchGapMsecs} apart using the shared, unsynchronized
     * {@code lastUrlFetchTime}.
     *
     * @param url              the article URL; also used as the client item id in the result log
     * @param existingCategory category already stored for the item; currently not used
     * @return the extracted attributes keyed by attribute name, or {@code null}
     *         if a required attribute is missing or any error occurred (errors
     *         are logged, never thrown)
     */
    public static Map<String, String> getAttributes(String url, String existingCategory) {
        ItemProcessResult itemProcessResult = new ItemProcessResult();
        itemProcessResult.client_item_id = url;
        itemProcessResult.extraction_status = "EXTRACTION_FAILED";

        logger.info("Trying to get attributes for " + url);
        Map<String, String> attributes = null;
        try {
            Document articleDoc = fetchThrottled(url);

            String img_url = extractImageUrl(articleDoc);
            String title = selectContentAttr(articleDoc, titleCssSelector);
            String leadtext = selectContentAttr(articleDoc, leadTextCssSelector);
            String publishDate = extractPublishDate(articleDoc);
            String link = selectContentAttr(articleDoc, linkCssSelector);

            //get CONTENT
            String description = "";
            if (StringUtils.isNotBlank(textCssSelector)) {
                Element descriptionElement = articleDoc.select(textCssSelector).first();
                if (descriptionElement != null) {
                    description = Jsoup.parse(descriptionElement.html()).text();
                }
            }

            //get TAGS
            String tags = "";
            Set<String> tagSet = AttributesImporterUtils.getTags(articleDoc, tagsCssSelector, title);
            if (tagSet.size() > 0) {
                tags = CollectionTools.join(tagSet, ",");
            }

            //get CATEGORY - client specific
            String category = "";
            if (StringUtils.isNotBlank(categoryCssSelector)) {
                category = upperCaseIfNotBlank(selectContentAttr(articleDoc, categoryCssSelector));
            } else if (StringUtils.isNotBlank(categoryClassPrefix)) {
                category = extractCategoryByClass(
                        "io.seldon.importer.articles.category." + categoryClassPrefix + "CategoryExtractor", url,
                        articleDoc);
            }

            //get Sub CATEGORY - client specific
            String subCategory = "";
            if (StringUtils.isNotBlank(subCategoryCssSelector)) {
                // Upper-case the sub category itself (not the category).
                subCategory = upperCaseIfNotBlank(selectContentAttr(articleDoc, subCategoryCssSelector));
            } else if (StringUtils.isNotBlank(subCategoryClassPrefix)) {
                subCategory = extractCategoryByClass(
                        "io.seldon.importer.articles.category." + subCategoryClassPrefix + "SubCategoryExtractor",
                        url, articleDoc);
            }

            // Get domain
            String domain = "";
            if (domainIsNeeded) {
                domain = getDomain(url);
            }

            if (StringUtils.isNotBlank(title) && (imageNotNeeded || StringUtils.isNotBlank(img_url))
                    && (categoryNotNeeded || StringUtils.isNotBlank(category))
                    && (!domainIsNeeded || StringUtils.isNotBlank(domain))) {
                attributes = new HashMap<String, String>();
                attributes.put(TITLE, title);
                putIfNotBlank(attributes, CATEGORY, category);
                putIfNotBlank(attributes, SUBCATEGORY, subCategory);
                putIfNotBlank(attributes, LINK, link);
                putIfNotBlank(attributes, LEAD_TEXT, leadtext);
                putIfNotBlank(attributes, IMG_URL, img_url);
                putIfNotBlank(attributes, TAGS, tags);
                attributes.put(CONTENT_TYPE, VERIFIED_CONTENT_TYPE);
                putIfNotBlank(attributes, DESCRIPTION, description);
                putIfNotBlank(attributes, PUBLISH_DATE, publishDate);
                putIfNotBlank(attributes, DOMAIN, domain);
                System.out.println("Item: " + url + "; Category: " + category);
                itemProcessResult.extraction_status = "EXTRACTION_SUCCEEDED";
            } else {
                logger.warn("Failed to get title for article " + url);
                logger.warn("[title=" + title + ", img_url=" + img_url + ", category=" + category + ", domain="
                        + domain + "]");
            }

            // Record which required attributes were missing for the result log.
            if (StringUtils.isBlank(title)) {
                itemProcessResult.attrib_failure_list = appendFailure(itemProcessResult.attrib_failure_list,
                        "title");
            }
            if (!imageNotNeeded && StringUtils.isBlank(img_url)) {
                itemProcessResult.attrib_failure_list = appendFailure(itemProcessResult.attrib_failure_list,
                        "img_url");
            }
            if (!categoryNotNeeded && StringUtils.isBlank(category)) {
                itemProcessResult.attrib_failure_list = appendFailure(itemProcessResult.attrib_failure_list,
                        "category");
            }
        } catch (Exception e) {
            logger.warn("Article: " + url + ". Attributes import FAILED", e);
            itemProcessResult.error = e.toString();
        }

        AttributesImporterUtils.logResult(logger, itemProcessResult);

        return attributes;
    }

    /**
     * Downloads the article, first sleeping if the previous fetch was less than
     * {@code minFetchGapMsecs} ago. The fetch time is recorded even when the
     * request fails, so that failing URLs are throttled as well.
     */
    private static Document fetchThrottled(String url) throws IOException, InterruptedException {
        long timeSinceLastRequest = System.currentTimeMillis() - lastUrlFetchTime;
        if (timeSinceLastRequest < minFetchGapMsecs) {
            long timeToSleep = minFetchGapMsecs - timeSinceLastRequest;
            logger.info("Sleeping " + timeToSleep + "msecs as time since last fetch is " + timeSinceLastRequest);
            Thread.sleep(timeToSleep);
        }
        try {
            return Jsoup.connect(url).userAgent("SeldonBot/1.0").timeout(httpGetTimeout).get();
        } finally {
            lastUrlFetchTime = System.currentTimeMillis();
        }
    }

    /**
     * Returns the "content" attribute of the first element matching
     * {@code cssSelector}, or an empty string if the selector is blank or
     * nothing matches.
     */
    private static String selectContentAttr(Document articleDoc, String cssSelector) {
        if (StringUtils.isBlank(cssSelector)) {
            return "";
        }
        Element element = articleDoc.select(cssSelector).first();
        if (element == null) {
            return "";
        }
        String content = element.attr("content");
        return content != null ? content : "";
    }

    /**
     * Resolves the image URL: the first non-blank of the matched element's
     * "content", "src" and "href" attributes, else the configured default
     * image. The result is stripped of surrounding whitespace.
     */
    private static String extractImageUrl(Document articleDoc) {
        String imageUrl = "";
        if (StringUtils.isNotBlank(imageCssSelector)) {
            Element imageElement = articleDoc.select(imageCssSelector).first();
            if (imageElement != null) {
                String[] candidateAttributes = { "content", "src", "href" };
                for (int i = 0; i < candidateAttributes.length; i++) {
                    String candidate = imageElement.attr(candidateAttributes[i]);
                    if (StringUtils.isBlank(imageUrl) && candidate != null) {
                        imageUrl = candidate;
                    }
                }
            }
        }
        if (StringUtils.isBlank(imageUrl) && StringUtils.isNotBlank(defImageUrl)) {
            logger.info("Setting image to default: " + defImageUrl);
            imageUrl = defImageUrl;
        }
        return StringUtils.strip(imageUrl);
    }

    /**
     * Reads the publish date from the configured selector and normalizes it to
     * {@code yyyy-MM-dd HH:mm:ss}. Returns an empty string if the selector is
     * blank, nothing matches, or the text is in neither supported format.
     */
    private static String extractPublishDate(Document articleDoc) {
        if (StringUtils.isBlank(publishDateCssSelector)) {
            return "";
        }
        Element pubElement = articleDoc.select(publishDateCssSelector).first();
        if (pubElement == null || pubElement.attr("content") == null) {
            return "";
        }
        String pubtext = pubElement.attr("content");

        // e.g. 2013-01-21T10:40:55Z
        Date result = parseDateOrNull(pubtext, "yyyy-MM-dd'T'HH:mm:ss", "Failed to parse date withUTC format ");
        if (result == null) {
            // Only fall back to the simpler format when the first one did not match.
            result = parseDateOrNull(pubtext, "yyyy-MM-dd HH:mm:ss", "Failed to parse date ");
        }
        if (result == null) {
            logger.error("Failed to parse date " + pubtext);
            return "";
        }
        return new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(result);
    }

    /** Parses {@code text} with {@code pattern}; logs {@code failureMessage + text} and returns null on mismatch. */
    private static Date parseDateOrNull(String text, String pattern, String failureMessage) {
        try {
            return new SimpleDateFormat(pattern, Locale.ENGLISH).parse(text);
        } catch (ParseException e) {
            logger.info(failureMessage + text);
            return null;
        }
    }

    /** Instantiates the named {@link CategoryExtractor} via its no-arg constructor and applies it to the article. */
    private static String extractCategoryByClass(String className, String url, Document articleDoc)
            throws Exception {
        Class<?> clazz = Class.forName(className);
        Constructor<?> ctor = clazz.getConstructor();
        CategoryExtractor extractor = (CategoryExtractor) ctor.newInstance();
        return extractor.getCategory(url, articleDoc);
    }

    /** Upper-cases {@code value} unless it is blank, in which case it is returned unchanged. */
    private static String upperCaseIfNotBlank(String value) {
        return StringUtils.isNotBlank(value) ? value.toUpperCase() : value;
    }

    /** Stores {@code value} under {@code key} only when the value is not blank. */
    private static void putIfNotBlank(Map<String, String> target, String key, String value) {
        if (StringUtils.isNotBlank(value)) {
            target.put(key, value);
        }
    }

    /** Appends {@code attributeName} to a comma separated failure list. */
    private static String appendFailure(CharSequence failureList, String attributeName) {
        return failureList + (StringUtils.isBlank(failureList) ? "" : ",") + attributeName;
    }

    /**
     * Command-line entry point: starts the fail-fast watchdog thread for the
     * main thread, parses the arguments into the static option fields, and
     * runs one import pass. If an {@link IllegalArgumentException} is raised
     * the stack trace and the usage text are printed instead.
     *
     * @param args command-line arguments, see the {@code @Argument} fields of this class
     * @throws InterruptedException propagated from {@link #run()}
     * @throws FileNotFoundException declared for callers
     */
    public static void main(String[] args) throws InterruptedException, FileNotFoundException {

        // Fail Fast thread
        final FailFast failFast = new FailFast(Thread.currentThread());
        final Thread watchdog = new Thread(failFast);
        watchdog.setName("fail_fast_thread");
        watchdog.start();

        try {
            Args.parse(FileItemAttributesImporter.class, args);

            final FileItemAttributesImporter importer = new FileItemAttributesImporter(
                    new DefaultApiClient(apiUrl, consumerKey, consumerSecret, API_TIMEOUT));
            importer.setFailFast(failFast);
            importer.run();
        } catch (IllegalArgumentException badArguments) {
            badArguments.printStackTrace();
            Args.usage(FileItemAttributesImporter.class);
        }
    }

    /**
     * Extracts the host name from a URL.
     *
     * <p>If the URL does not start with an explicit {@code http://} or
     * {@code https://} scheme (compared case-insensitively), {@code http://}
     * is prepended before parsing, so bare hosts such as
     * {@code example.com/page} are accepted.
     *
     * @param url The URL to extract the domain from; may be null.
     * @return The host of the URL, or UNKOWN_DOMAN if the url is null or cannot be parsed.
     */
    private static String getDomain(String url) {
        String retVal = "UNKOWN_DOMAN";
        if (url == null) {
            logger.warn("Failed to get domain for " + url);
            return retVal;
        }
        // Test for the full "scheme://" prefix: a plain startsWith("http") would
        // also match scheme-less hosts that merely begin with "http".
        boolean hasHttpScheme = url.regionMatches(true, 0, "http://", 0, "http://".length())
                || url.regionMatches(true, 0, "https://", 0, "https://".length());
        if (!hasHttpScheme) {
            url = "http://" + url;
        }
        try {
            retVal = new URL(url).getHost();
        } catch (MalformedURLException e) {
            logger.warn("Failed to get domain for " + url);
        }

        return retVal;
    }

    /**
     * Installs the fail-fast watchdog that {@link #run()} stops once the import
     * pass has finished.
     *
     * @param failFast the watchdog created in {@code main}; may be null to disable
     */
    private void setFailFast(FailFast failFast) {
        this.failFast = failFast;
    }
}