hello.Scraper.java Source code

Java tutorial

Introduction

Here is the source code for hello.Scraper.java

Source

/*
 * Copyright 2014 NAKANO Hideo.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package hello;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeVisitor;
import org.springframework.http.ResponseEntity;
import org.springframework.integration.annotation.Filter;
import org.springframework.integration.annotation.MessageEndpoint;
import org.springframework.integration.annotation.Splitter;
import org.springframework.integration.annotation.Transformer;

import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.TimeZone;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

@MessageEndpoint
public class Scraper {
    private final Pattern patter = Pattern.compile("^<li>\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2} \\S+");

    @Splitter(inputChannel = "channel1", outputChannel = "channel2")
    public List<Element> scrape(ResponseEntity<String> payload) {
        String html = payload.getBody();
        final Document htmlDoc = Jsoup.parse(html);
        final Elements anchorNodes = htmlDoc.select("body").select("ul").select("li");

        final List<Element> anchorList = new ArrayList<Element>();
        anchorNodes.traverse(new NodeVisitor() {
            @Override
            public void head(org.jsoup.nodes.Node node, int depth) {
                if (node instanceof org.jsoup.nodes.Element) {
                    Element e = (Element) node;
                    anchorList.add(e);
                }
            }

            @Override
            public void tail(Node node, int depth) {
            }
        });

        return anchorList;
    }

    @Filter(inputChannel = "channel2", outputChannel = "channel3")
    public boolean filter(Element payload) {
        Matcher m = patter.matcher(payload.toString());
        return m.find();
    }

    @Transformer(inputChannel = "channel3", outputChannel = "channel4")
    public DumpEntry convert(Element payload) throws ParseException {
        String dateStr = payload.ownText().substring(0, 19);

        DateFormat format = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss");
        format.setTimeZone(TimeZone.getTimeZone("GMT"));

        Date timestamp = format.parse(dateStr);

        Elements list = payload.select("a");
        String id;
        String ref;
        if (list.size() > 0) {
            Element a = list.get(0);
            id = a.ownText();
            ref = a.attr("href");
        } else {
            id = "private data";
            ref = null;
        }

        Element span = payload.select("span").get(0);
        String status = span.ownText();

        return new DumpEntry(timestamp, id, ref, status);
    }
}