opennlp.tools.parse_thicket.opinion_processor.YouTubeMiner.java Source code

Java tutorial

Introduction

Here is the source code for opennlp.tools.parse_thicket.opinion_processor.YouTubeMiner.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.tools.parse_thicket.opinion_processor;

import java.io.File;
import java.io.IOException;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;

import opennlp.tools.similarity.apps.utils.PageFetcher;

/**
 * Extracts subscriber and view counts from the raw HTML of a YouTube page.
 *
 * <p>The numbers are located by plain substring matching on markers found in the page markup
 * ({@code subscriber-count}, {@code tabindex}, {@code view-count}); no HTML parsing is performed.
 * A value that cannot be located or parsed is reported on {@code System.err} and the
 * corresponding field of the result is left at whatever {@link YouTubeMinerResult} initialises
 * it to.
 */
public class YouTubeMiner {
    private static final String SUBSCRIBERS_NOT_FOUND = "Not found data for 'subscriber-count', 'tabindex'";
    private static final String VIEWS_NOT_FOUND = "Not found data for 'view-count','<div>'";

    private final PageFetcher fetcher = new PageFetcher();

    /**
     * Fetches the page at {@code url}, stores a copy of the fetched HTML on disk and extracts the
     * counters it can find.
     *
     * <p>URLs containing the text {@code channel} only yield a subscriber count; any other URL
     * yields a subscriber count and a view count. The two values are extracted independently, so
     * a missing or malformed subscriber count does not prevent the view count from being read.
     *
     * @param url address of the YouTube page to mine
     * @return a result object carrying every counter that could be extracted
     */
    public YouTubeMinerResult getData(String url) {
        YouTubeMinerResult result = new YouTubeMinerResult();
        String content = fetcher.fetchOrigHTML(url);
        saveSnapshot(url, content);

        boolean channelPage = url.contains("channel");

        // Markup looks like: subscriber-count" title="30" ... tabindex
        String subscribersText = extractSubscribersTitle(content);
        if (subscribersText != null) {
            // The two page kinds keep their original clean-up rules: channel pages drop every
            // non-ASCII character, other pages drop plain spaces (presumably digit-group
            // separators in both cases -- not confirmed against live markup).
            subscribersText = channelPage
                    ? subscribersText.replaceAll("[^\\x00-\\x7F]", "")
                    : subscribersText.replace(" ", "");
        }
        Integer subscribersCount = parseCount(subscribersText, SUBSCRIBERS_NOT_FOUND);
        if (subscribersCount != null) {
            int subscribers = subscribersCount.intValue();
            result.subscribers = subscribers;
        }

        if (!channelPage) {
            // Markup looks like: view-count">12 ...; take the text between the first '>' after
            // the marker and the following space.
            String viewsStrDirty = StringUtils.substringBetween(content, "view-count", "<div>");
            String viewsStr = StringUtils.substringBetween(viewsStrDirty, ">", " ");
            Integer viewsCount = parseCount(viewsStr, VIEWS_NOT_FOUND);
            if (viewsCount != null) {
                int views = viewsCount.intValue();
                result.views = views;
            }
        }

        return result;
    }

    /**
     * Writes the fetched HTML to a file whose name is the URL with {@code ':'} and {@code '/'}
     * replaced by {@code '_'}. Failure to write is reported but does not abort the mining.
     */
    private static void saveSnapshot(String url, String content) {
        try {
            FileUtils.writeStringToFile(new File(url.replace(':', '_').replace('/', '_')), content);
        } catch (IOException e) {
            System.err.println("Could not save a copy of " + url + ": " + e);
        }
    }

    /**
     * Returns the value of the {@code title="..."} attribute found between the
     * {@code subscriber-count} and {@code tabindex} markers, or {@code null} when either the
     * markers or the attribute are absent (or {@code content} itself is {@code null}).
     */
    private static String extractSubscribersTitle(String content) {
        String subscribersStr = StringUtils.substringBetween(content, "subscriber-count", "tabindex");
        return StringUtils.substringBetween(subscribersStr, "title=\"", "\"");
    }

    /**
     * Parses {@code number} as a decimal integer.
     *
     * @param number text to parse; {@code null} means the value was not found in the page
     * @param notFoundMessage message printed to {@code System.err} when {@code number} is null
     * @return the parsed value, or {@code null} when the text is missing or not a valid integer
     */
    private static Integer parseCount(String number, String notFoundMessage) {
        if (number == null) {
            System.err.println(notFoundMessage);
            return null;
        }
        try {
            return Integer.valueOf(Integer.parseInt(number));
        } catch (NumberFormatException e) {
            System.err.println("Cannot parse '" + number + "' as an integer: " + e.getMessage());
            return null;
        }
    }

    /**
     * Demo entry point: mines one channel page and three video pages and prints each result.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        YouTubeMiner miner = new YouTubeMiner();
        System.out.println(miner.getData("https://www.youtube.com/channel/UC-maQbG5eUS5c1wmaTnLwTA"));
        System.out.println(miner.getData("https://www.youtube.com/watch?v=U6X4VT9dVr8"));
        System.out.println(miner.getData("https://www.youtube.com/watch?v=kH-AQnta714"));
        System.out.println(miner.getData("https://www.youtube.com/watch?v=pWb50Kn1ShQ"));
    }
}