edu.usc.cs.ir.selenium.handler.GlocktalkBasic.java Source code

Java tutorial

Introduction

Here is the source code for edu.usc.cs.ir.selenium.handler.GlocktalkBasic.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.usc.cs.ir.selenium.handler;

import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.firefox.FirefoxDriver;

import edu.usc.cs.ir.selenium.model.InteractiveSeleniumHandler;

/**
 * Selenium handler for http://www.glocktalk.com/ that extracts weapon-related posts.
 * 
 * Command to extract URLs from this host:
 * curl -s "http://www.glocktalk.com" | grep -ioE 'href="[-A-Za-z0-9+&@#/%?=~_|!:,.;]*"' | awk -F "href=" '{print $2}'| grep -v "#"
 * 
 * Seed list is as below:
 * http://www.glocktalk.com/forum/general-firearms-forum.82/
 * http://www.glocktalk.com/forum/tactical-shotguns.180/
 * http://www.glocktalk.com/forum/black-rifle-forum.93/
 * http://www.glocktalk.com/forum/1911-forums.77/
 * http://www.glocktalk.com/forum/the-kalashnikov-klub.94/
 * http://www.glocktalk.com/forum/rimfire-forum.61/
 * http://www.glocktalk.com/forum/gun-related-clubs.164/
 * http://www.glocktalk.com/forum/general-competition.25/
 * http://www.glocktalk.com/forum/heckler-koch-forum.129/
 * http://www.glocktalk.com/forum/gate-info-announcements.248/
 * http://www.glocktalk.com/forum/gate-self-defense-forum.256/
 * http://www.glocktalk.com/forum/gate-ar-15-forum.257/
 * http://www.glocktalk.com/forum/gate-nfa-class-iii.251/
 * http://www.glocktalk.com/forum/gate-long-range-shooting.252/
 * http://www.glocktalk.com/forum/gate-survival-preparedness.255/
 * http://www.glocktalk.com/forum/gate-night-vision-forum.259/
 * http://www.glocktalk.com/forum/firearms-listings.39/
 * http://www.glocktalk.com/forum/gun-parts-accessories.40/
 * http://www.glocktalk.com/forum/edged-weapons-related-items.46/
 * http://www.glocktalk.com/forum/holsters-related-items.41/
 * http://www.glocktalk.com/forum/misc-gun-stuff.42/
 * http://www.glocktalk.com/forum/wanted-to-buy.45/
 * http://www.glocktalk.com/forum/gun-parts-access.22/
 * 
 * @author karanjeets
 *
 */
public class GlocktalkBasic implements InteractiveSeleniumHandler {

    /** Scheme and host shared by every URL this handler deals with. */
    private static final String HOST = "http://www.glocktalk.com";

    /** URL prefix of forum (thread listing) pages. */
    private static final String FORUM_PREFIX = HOST + "/forum";

    /** URL prefix of individual thread pages. */
    private static final String THREAD_PREFIX = HOST + "/threads";

    /** URL prefixes of links found inside threads that must not be emitted as outlinks. */
    private static final String[] EXCLUDED_LINK_PREFIXES = {
        HOST + "/account",
        HOST + "/misc",
        HOST + "/goto",
        HOST + "/social-forums",
        HOST + "/search"
    };

    /**
     * Builds a textual representation of the page currently loaded in {@code driver}.
     *
     * <p>The result contains, in this order and each terminated by a newline:
     * <ol>
     *   <li>the visible text of the {@code <body>} element;</li>
     *   <li>on forum pages, one {@code <a href="..." />} line per thread title link;</li>
     *   <li>one {@code <a href="..." />} line for the last page-navigation entry, if present;</li>
     *   <li>on thread pages, one {@code <a href="..." />} line per distinct, non-excluded link
     *       found inside the message list.</li>
     * </ol>
     *
     * @param driver the driver whose current page is processed
     * @return the body text followed by the extracted anchor lines
     */
    public String processDriver(WebDriver driver) {
        StringBuilder buffer = new StringBuilder();

        // getText() returns only the visible text, so links are re-added explicitly below.
        String content = driver.findElement(By.tagName("body")).getText();
        buffer.append(content).append("\n");

        String currentUrl = driver.getCurrentUrl();

        // Extract threads from a forum
        if (currentUrl != null && currentUrl.startsWith(FORUM_PREFIX)) {
            List<WebElement> threadTopics = driver.findElements(By.xpath("//h3[@class='title']"));
            for (WebElement topic : threadTopics) {
                // findElements (plural) returns an empty list instead of throwing when a
                // title heading carries no anchor, so one odd heading cannot abort the page.
                List<WebElement> anchors = topic.findElements(By.tagName("a"));
                if (anchors.isEmpty()) {
                    continue;
                }
                String href = anchors.get(0).getAttribute("href");
                if (hasValue(href)) {
                    appendAnchor(buffer, href);
                }
            }
        }

        // Extract Next Page Link: the last "text" entry of the page navigation bar.
        List<WebElement> nav = driver.findElements(By.xpath("//div[@class='PageNav']//nav//*[@class='text']"));
        if (!nav.isEmpty()) {
            // The matched node is not guaranteed to be an anchor; skip it when it has no
            // href rather than emitting the literal string "null" as a link target.
            String href = nav.get(nav.size() - 1).getAttribute("href");
            if (hasValue(href)) {
                appendAnchor(buffer, href);
            }
        }

        // Extract all links from a thread
        if (currentUrl != null && currentUrl.startsWith(THREAD_PREFIX)) {
            List<WebElement> links = driver.findElements(By.xpath("//ol[@id='messageList']//*[@href]"));
            Set<String> filteredLinks = new HashSet<String>();
            for (WebElement link : links) {
                String href = link.getAttribute("href");
                // The filter is applied to the raw href; applying it to the wrapped
                // "<a href=..." string would never match any of the URL prefixes.
                if (isWantedThreadLink(href)) {
                    filteredLinks.add(href);
                }
            }
            // One anchor per line, consistent with the forum and navigation sections.
            for (String href : filteredLinks) {
                appendAnchor(buffer, href);
            }
        }

        return buffer.toString();
    }

    /**
     * Tells whether this handler is responsible for the given URL.
     *
     * @param URL the URL to test; may be {@code null}
     * @return {@code true} if {@code URL} starts with the forum or thread prefix of the
     *         target host, {@code false} otherwise (including for {@code null})
     */
    public boolean shouldProcessURL(String URL) {
        return URL != null && (URL.startsWith(FORUM_PREFIX) || URL.startsWith(THREAD_PREFIX));
    }

    /**
     * Manual smoke test: loads one forum page in Firefox and prints the handler output.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        GlocktalkBasic glocktalk = new GlocktalkBasic();
        WebDriver driver = new FirefoxDriver();
        try {
            driver.get("http://www.glocktalk.com/forum/general-firearms-forum.82/");
            // Print the String directly: encoding it to UTF-8 bytes and decoding those bytes
            // with the platform default charset corrupts non-ASCII text on non-UTF-8 systems.
            System.out.println(glocktalk.processDriver(driver));
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // quit() closes every associated window, so a preceding close() is unnecessary.
            driver.quit();
        }
    }

    /** Appends one {@code <a href="..." />} line for {@code href} to {@code buffer}. */
    private static void appendAnchor(StringBuilder buffer, String href) {
        buffer.append("<a href=\"").append(href).append("\" />").append("\n");
    }

    /** Returns {@code true} if {@code value} is neither {@code null} nor empty. */
    private static boolean hasValue(String value) {
        return value != null && value.length() > 0;
    }

    /**
     * Returns {@code true} if a link found inside a thread should be emitted: it must have a
     * value, contain no {@code #} fragment, and not start with an excluded prefix.
     */
    private static boolean isWantedThreadLink(String href) {
        if (!hasValue(href) || href.contains("#")) {
            return false;
        }
        for (String prefix : EXCLUDED_LINK_PREFIXES) {
            if (href.startsWith(prefix)) {
                return false;
            }
        }
        return true;
    }

}