org.loklak.harvester.YoutubeScraper.java Source code

Java tutorial

Introduction

Here is the source code for org.loklak.harvester.YoutubeScraper.java

Source

/**
 *  YoutubeScraper
 *  Copyright 22.03.2016 by Michael Peter Christen, @0rb1t3r
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *  
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *  
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package org.loklak.harvester;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.loklak.tools.CharacterCoding;

/**
 * Line-oriented scraper for the HTML of a YouTube video page.
 *
 * <p>The page is read line by line; every line is trimmed and matched against a fixed list of
 * textual markers. Whatever is recognized is stored in a {@link JSONObject}:
 * <ul>
 * <li>RDF-like statements (subject, predicate, object) are collected by
 *     {@code addRDF} as arrays of distinct strings under the key
 *     {@code subject + "_" + predicate}, e.g. {@code html_title}, {@code og_*}, {@code twitter_*},
 *     {@code youtube_category}, {@code youtube_license}, {@code youtube_next},
 *     {@code youtube_playlist_title}, {@code youtube_playlist_videoid};</li>
 * <li>single values are stored directly: {@code youtube_subscriber}, {@code youtube_likes},
 *     {@code youtube_dislikes}, {@code youtube_viewcount} (numbers) and
 *     {@code youtube_description} (text).</li>
 * </ul>
 * All methods are static; the class keeps no state besides its constants.
 */
public class YoutubeScraper {

    /** Fixed pool of 40 threads. This class only declares it and never submits work to it. */
    public static final ExecutorService executor = Executors.newFixedThreadPool(40);

    /** HTML tags whose text content is stored as {@code html_<tag>}. */
    private static final String[] HTML_TAGS = new String[] { "title" };

    /** Prefixes of {@code property="prefix:..."} / {@code name="prefix:..."} meta attributes to harvest. */
    private static final String[] MICROFORMAT_VOCABULARIES = new String[] { "og", "twitter" };

    /** Separators removed from a number string before it is parsed. */
    private static final Pattern NUMBER_SEPARATORS = Pattern.compile(",|\\.");

    /**
     * A single opening or closing {@code <p>} tag. The match is bounded by the tag's own '&gt;'
     * so that text following the tag on the same line is never removed.
     */
    private static final Pattern PARAGRAPH_TAG = Pattern.compile("</?p\\b[^>]*>");

    /** The line break tag as it is matched in the description. */
    private static final Pattern LINE_BREAK_TAG = Pattern.compile("<br />");

    /** An anchor element; group 1 is the link text. */
    private static final Pattern ANCHOR_TAG = Pattern.compile("<a .*?>(.*?)</a>");

    /**
     * Parses a video page stored in a file.
     *
     * @param file the file containing the HTML of the page, read as UTF-8
     * @return the harvested metadata
     * @throws IOException if the file cannot be opened or read
     */
    public static JSONObject parseVideo(File file) throws IOException {
        // try-with-resources: the stream is released even if parsing fails
        try (FileInputStream fis = new FileInputStream(file)) {
            return parseVideo(fis);
        }
    }

    /**
     * Parses a video page from a stream. The stream is decoded as UTF-8 and is closed by this
     * method.
     *
     * @param is the stream delivering the HTML of the page
     * @return the harvested metadata
     * @throws IOException if reading fails
     */
    public static JSONObject parseVideo(InputStream is) throws IOException {
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {
            return parseVideo(reader);
        }
    }

    /**
     * Parses a video page line by line. The reader is closed by this method, also when reading
     * fails.
     *
     * <p>A line that cannot be processed is reported on {@code System.err} and skipped; it does
     * not abort the parsing of the remaining lines.
     *
     * @param br the reader delivering the HTML of the page
     * @return the harvested metadata; empty if nothing was recognized
     * @throws IOException if reading from {@code br} fails
     */
    public static JSONObject parseVideo(final BufferedReader br) throws IOException {
        // NOTE(review): the boolean argument presumably selects an insertion-ordered object - confirm
        // against the project's JSONObject
        JSONObject json = new JSONObject(true);
        // parser state: inside a <span itemprop=...> element / waiting for the <li> of the info tag list
        boolean parseSpan = false, parseLicense = false;
        String itemprop = "", itemtype = ""; // attributes of the span which is currently open
        try {
            String input;
            while ((input = br.readLine()) != null) {
                try {
                    input = input.trim();
                    //System.out.println(input); // uncomment temporary to debug or add new fields
                    int p;

                    if (parseLicense) {
                        if ((p = input.indexOf("<li")) >= 0) {
                            String tag = parseTag(input, p);
                            if (tag == null)
                                continue;
                            if (tag.startsWith("<a ")) {
                                // the list item wraps a link: the link text is stored as category
                                tag = parseTag(tag, 0);
                                addRDF(new String[] { "youtube", "category", tag }, json);
                            } else {
                                addRDF(new String[] { "youtube", "license", tag }, json);
                            }
                            parseLicense = false;
                            continue;
                        }
                    } else if (parseSpan) {
                        if ((p = input.indexOf("itemprop=\"")) >= 0) {
                            String[] token = parseItemprop(input, p, new String[] { "href", "content" }, "");
                            if (token == null)
                                continue;
                            int q = itemtype.indexOf("//");
                            if (q < 0)
                                continue;
                            // the subject is derived from the itemtype url: everything after "//" with '.' and '/' mapped to '_'
                            String subject = itemtype.substring(q + 2).replace('.', '_').replace('/', '_');
                            String predicate = itemprop + "_" + token[1];
                            String object = token[2];
                            addRDF(new String[] { subject, predicate, object }, json);
                            continue;
                        }
                        if (input.indexOf("</span>") >= 0) {
                            parseSpan = false;
                            continue;
                        }
                    } else {
                        // a line matching one of the two loops below is still checked against the following rules
                        for (String tag : HTML_TAGS) {
                            if ((p = input.indexOf("<" + tag)) >= 0) {
                                addRDF(new String[] { "html", tag, parseTag(input, p) }, json);
                            }
                        }
                        for (String subject : MICROFORMAT_VOCABULARIES) {
                            if ((p = input.indexOf("property=\"" + subject + ":")) >= 0) {
                                addRDF(parseMicroformat(input, "property", p), json);
                            } else if ((p = input.indexOf("name=\"" + subject + ":")) >= 0) {
                                addRDF(parseMicroformat(input, "name", p), json);
                            }
                        }
                        if ((p = input.indexOf("span itemprop=\"")) >= 0) {
                            String[] token = parseItemprop(input, p, new String[] { "itemtype" }, "");
                            if (token == null)
                                continue;
                            itemprop = token[1];
                            itemtype = token[2];
                            parseSpan = true;
                            continue;
                        }
                        if ((p = input.indexOf("itemprop=\"")) >= 0) {
                            String[] token = parseItemprop(input, p, new String[] { "content" }, "youtube");
                            if (token == null)
                                continue;
                            addRDF(token, json);
                            continue;
                        }
                        if ((p = input.indexOf("class=\"content watch-info-tag-list")) >= 0) {
                            parseLicense = true;
                            continue;
                        }
                        if ((p = input.indexOf("yt-subscriber-count")) >= 0) {
                            putCount(json, "youtube_subscriber", parseProp(input, p, "title"));
                            continue;
                        }
                        if (input.indexOf("\"like this") > 0 && (p = input.indexOf("yt-uix-button-content")) >= 0) {
                            putCount(json, "youtube_likes", parseTag(input, p));
                            continue;
                        }
                        if (input.indexOf("\"dislike this") > 0 && (p = input.indexOf("yt-uix-button-content")) >= 0) {
                            putCount(json, "youtube_dislikes", parseTag(input, p));
                            continue;
                        }
                        if ((p = input.indexOf("watch-view-count")) >= 0) {
                            String viewcountString = parseTag(input, p);
                            if (viewcountString == null)
                                continue;
                            viewcountString = viewcountString.replace(" views", "");
                            if (viewcountString.length() == 0)
                                continue;
                            long viewcount = 0;
                            // if there are no views, there may be a string saying "No". But this is done in all languages,
                            // so a text which is not a number is counted as zero views
                            try {
                                viewcount = parseNumber(viewcountString);
                            } catch (NumberFormatException ignored) {
                                // keep the default of zero views
                            }
                            json.put("youtube_viewcount", viewcount);
                            continue;
                        }
                        if ((p = input.indexOf("watch?v=")) >= 0) {
                            p += 8; // length of "watch?v="
                            int q = input.indexOf("\"", p);
                            if (q > 0) {
                                String videoid = input.substring(p, q);
                                // cut off further url parameters
                                int r = videoid.indexOf('&');
                                if (r > 0)
                                    videoid = videoid.substring(0, r);
                                addRDF(new String[] { "youtube", "next", videoid }, json);
                                continue;
                            }
                        }
                        if ((p = input.indexOf("playlist-header-content")) >= 0) {
                            String playlistTitle = parseProp(input, p, "data-list-title");
                            if (playlistTitle == null)
                                continue;
                            addRDF(new String[] { "youtube", "playlist_title", playlistTitle }, json);
                            continue;
                        }
                        if ((p = input.indexOf("yt-uix-scroller-scroll-unit")) >= 0) {
                            String playlistVideoid = parseProp(input, p, "data-video-id");
                            if (playlistVideoid == null)
                                continue;
                            addRDF(new String[] { "youtube", "playlist_videoid", playlistVideoid }, json);
                            continue;
                        }
                        if ((p = input.indexOf("watch-description-text")) >= 0) {
                            // the description is the text from the end of the opening tag up to the closing div (or the end of the line)
                            p = input.indexOf('>', p);
                            int q = input.indexOf("</div", p);
                            String text = input.substring(p + 1, q < 0 ? input.length() : q);
                            // line break tags become newlines, paragraph tags are dropped
                            text = PARAGRAPH_TAG.matcher(LINE_BREAK_TAG.matcher(text).replaceAll("\n")).replaceAll("").trim();
                            // replace every anchor element by its link text. The text is quoted because it is
                            // arbitrary page content: a '$' or '\' in it must not be read as a group reference
                            Matcher m;
                            while ((m = ANCHOR_TAG.matcher(text)).find()) {
                                text = m.replaceFirst(Matcher.quoteReplacement(m.group(1) + " "));
                            }
                            text = CharacterCoding.html2unicode(text);
                            json.put("youtube_description", text);
                            continue;
                        }
                    }
                } catch (RuntimeException e) {
                    // best effort: an unexpected line must not stop the harvester, report it and go on with the next line
                    e.printStackTrace();
                    System.err.println("error in video " + json.toString(2));
                    System.err.println("current line: " + input);
                }
            }
        } finally {
            br.close();
        }
        return json;
    }

    /**
     * Parses an integer number which may contain ',' or '.' as grouping separators.
     *
     * @param n the number string, must not be null
     * @return the number with all separators removed
     * @throws NumberFormatException if the remaining text is not a valid long
     */
    private static long parseNumber(String n) throws NumberFormatException {
        return Long.parseLong(NUMBER_SEPARATORS.matcher(n).replaceAll(""));
    }

    /**
     * Stores a counter in the json object if the given text is a number.
     * Nothing is stored if the text is missing, empty or not a number.
     *
     * @param json   the object which receives the value
     * @param key    the key for the value
     * @param number the text to parse, may be null
     */
    private static void putCount(JSONObject json, String key, String number) {
        if (number == null || number.isEmpty())
            return;
        try {
            json.put(key, parseNumber(number));
        } catch (NumberFormatException ignored) {
            // the page shows something which is not a plain number: leave the field out
        }
    }

    /**
     * Parses a meta line of the form {@code key="subject:predicate" ... content="object"}.
     *
     * @param line  the line to parse
     * @param key   the attribute name holding the qualified name, e.g. {@code property} or {@code name}
     * @param start the position where the search for the attribute starts
     * @return an array { subject, predicate, object } with ':' replaced by '_' in subject and
     *         predicate, or null if the line does not have the expected form
     */
    private static String[] parseMicroformat(String line, String key, int start) {
        int p = line.indexOf(key + "=\"", start);
        if (p < 0)
            return null;
        p += key.length() + 2; // first character of the attribute value
        int c = line.indexOf(":", p);
        if (c < 0)
            return null;
        int q = line.indexOf("\"", c);
        if (q < 0)
            return null;
        int r = line.indexOf("content=\"", q);
        if (r < 0)
            return null;
        r += 9; // length of content="
        int s = line.indexOf("\"", r);
        if (s < 0)
            return null;
        // this is a rdf statement
        String subject = line.substring(p, c).replace(':', '_');
        String predicate = line.substring(c + 1, q).replace(':', '_');
        String object = line.substring(r, s);
        return new String[] { subject, predicate, object };
    }

    /**
     * Parses a line of the form {@code itemprop="predicate" ... objectname="object"}.
     *
     * @param line        the line to parse
     * @param start       the position where the search for {@code itemprop="} starts
     * @param objectnames attribute names which may hold the object; the first name which is
     *                    found after the itemprop value is used
     * @param subject     the subject to put into the result
     * @return an array { subject, predicate, object } with ':' replaced by '_' in the predicate,
     *         or null if the line does not have the expected form
     */
    private static String[] parseItemprop(String line, int start, String[] objectnames, String subject) {
        int p = line.indexOf("itemprop=\"", start);
        if (p < 0)
            return null;
        p += 10; // length of itemprop="
        int q = line.indexOf("\"", p);
        if (q < 0)
            return null;
        int r = -1;
        for (String objectname : objectnames) {
            r = line.indexOf(objectname + "=\"", q);
            if (r >= 0) {
                r += objectname.length() + 2; // first character of the attribute value
                break;
            }
        }
        if (r < 0)
            return null;
        int s = line.indexOf("\"", r);
        if (s < 0)
            return null;
        // this becomes a rdf statement
        String predicate = line.substring(p, q).replace(':', '_');
        String object = line.substring(r, s);
        return new String[] { subject, predicate, object };
    }

    /**
     * Adds a statement to the json object. The object of the statement is decoded with
     * {@code CharacterCoding.html2unicode} and appended to the array stored under
     * {@code subject + "_" + predicate} unless the array already contains it. A value stored
     * under that key which is not an array is replaced by a new array.
     *
     * <p>Statements which are null or have a null or empty element are ignored.
     *
     * @param spo  an array { subject, predicate, object }, may be null
     * @param json the object which collects the statements
     */
    private static void addRDF(String[] spo, JSONObject json) {
        // the tag parsers deliver null for lines they cannot read: there is nothing to add then
        if (spo == null || spo[0] == null || spo[1] == null || spo[2] == null)
            return;
        String subject = spo[0];
        String predicate = spo[1];
        String object = CharacterCoding.html2unicode(spo[2]);
        if (object == null || subject.isEmpty() || predicate.isEmpty() || object.isEmpty())
            return;
        String key = subject + "_" + predicate;
        JSONArray objects = json.optJSONArray(key);
        if (objects == null) {
            objects = new JSONArray();
            json.put(key, objects);
        }
        // the array is used like a set: do not add an object twice
        for (Object o : objects) {
            if (object.equals(o))
                return;
        }
        // add the object to the objects
        objects.put(object);
    }

    /**
     * Reads the value of an attribute {@code key="value"}.
     *
     * @param line  the line to parse
     * @param start the position where the search for the attribute starts
     * @param key   the attribute name
     * @return the attribute value, or null if the attribute or its closing quote is missing
     */
    private static String parseProp(String line, int start, String key) {
        int p = line.indexOf(key + "=\"", start);
        if (p >= 0) {
            int valueStart = p + key.length() + 2;
            int q = line.indexOf('"', valueStart);
            if (q > 0) {
                return line.substring(valueStart, q);
            }
        }
        return null;
    }

    /**
     * Reads the content of an element which is opened at or after the given position. The
     * content starts behind the first '&gt;' and ends in front of the closing tag which balances
     * the opening tag; nested elements are part of the content.
     *
     * @param line  the line to parse
     * @param start a position inside of the opening tag
     * @return the trimmed content, an empty string if the element is not closed within the line,
     *         or null if the line has no '&gt;' at or after {@code start}
     */
    private static String parseTag(String line, int start) {
        int p = line.indexOf('>', start);
        if (p < 0)
            return null;
        int c = 1; // we count the number of open tags and stop if the number is zero. We already passed the first tag which is c = 1
        int q = p + 1; // start scan at the next position
        while (c > 0 && q < line.length() - 1) {
            if (line.charAt(q) == '<') {
                char next = line.charAt(q + 1);
                if (next == '/') {
                    c--;
                } else if (next != 'i') {
                    // NOTE(review): opening tags starting with 'i' are not counted, presumably because
                    // elements like <img> have no closing tag - confirm
                    c++;
                }
            }
            q++;
        }
        if (c != 0)
            return "";
        // q is one position behind the '<' of the balancing closing tag
        return line.substring(p + 1, q - 1).trim();
    }

}