no.trank.openpipe.wikipedia.producer.meta.RssMetaParser.java Source code

Java tutorial

Introduction

Here is the source code for no.trank.openpipe.wikipedia.producer.meta.RssMetaParser.java

Source

/*
 * Copyright 2007  T-Rank AS
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package no.trank.openpipe.wikipedia.producer.meta;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.util.DateParseException;
import org.apache.commons.httpclient.util.DateUtil;

/**
 * @version $Revision$
 */
/**
 * Extracts download metadata from an RSS document given as a string.
 *
 * <p>The input must contain a {@code <channel>} whose {@code <title>} is directly followed by a
 * {@code <link>} ending in {@code /<word>/<digits>}, and an {@code <item>} whose {@code <pubDate>}
 * closes the channel. From these the parser derives:
 * <ul>
 *   <li>the file name {@code <word>-<digits>-<title>},</li>
 *   <li>the file URL, resolved against the link with a trailing slash appended,</li>
 *   <li>the checksum list URL {@code <word>-<digits>-md5sums.txt}, resolved the same way,</li>
 *   <li>the modification date parsed from the {@code <pubDate>} text.</li>
 * </ul>
 *
 * <p>For illustration: a link {@code http://host/enwiki/20080103} with title
 * {@code pages-articles.xml.bz2} gives the file name
 * {@code enwiki-20080103-pages-articles.xml.bz2}.
 *
 * <p>NOTE(review): the title and link groups are greedy and matched with {@link Pattern#DOTALL};
 * for feeds with several items whose own links also end in {@code /<word>/<digits>} the match may
 * not land on the channel-level title -- confirm against the real feed layout.
 */
public class RssMetaParser {
    /**
     * Groups: 1 = channel title, 2 = full link, 3 = last-but-one path segment of the link,
     * 4 = numeric last path segment of the link, 5 = text of the pubDate that closes the channel.
     */
    private static final Pattern PATT_RSS = Pattern
            .compile("<channel>\\s*<title>(.+)</title>\\s*<link>(.+/(\\w+)/(\\d+))</link>.*"
                    + "<item>.*<pubDate>(.+)</pubDate>\\s*</item>\\s*</channel>", Pattern.DOTALL);
    private final URL fileUrl;
    private final URL md5Url;
    private final Date modifiedDate;
    private final String fileName;
    /** Matches one line of a checksum list: 32 hex digits, whitespace, then exactly {@link #fileName}. */
    private final Pattern md5Pattern;

    /**
     * Parses the given RSS text.
     *
     * @param rss the RSS document as a string
     * @throws MalformedURLException if the link found in the feed cannot be turned into a URL
     * @throws IllegalArgumentException if {@code rss} does not match the expected structure, or
     *         if the {@code <pubDate>} text cannot be parsed as a date
     */
    public RssMetaParser(String rss) throws MalformedURLException {
        final Matcher matcher = PATT_RSS.matcher(rss);
        if (!matcher.find()) {
            throw new IllegalArgumentException(
                    "input does not match pattern '" + PATT_RSS.pattern() + "' was: [" + rss + ']');
        }
        // The trailing slash makes the last path segment a directory, so relative names resolve below it.
        final URL baseUrl = new URL(matcher.group(2) + '/');
        final String filePrefix = matcher.group(3) + '-' + matcher.group(4);
        fileName = filePrefix + '-' + matcher.group(1);
        fileUrl = new URL(baseUrl, fileName);
        md5Url = new URL(baseUrl, filePrefix + "-md5sums.txt");
        // The file name comes from feed text and normally contains '.', so it must be quoted:
        // unquoted, '.' would match any character and other metacharacters could change or
        // break the expression.
        md5Pattern = Pattern.compile("^([0-9a-fA-F]{32})\\s+" + Pattern.quote(fileName) + '$',
                Pattern.MULTILINE);
        try {
            modifiedDate = DateUtil.parseDate(matcher.group(5));
        } catch (DateParseException e) {
            throw new IllegalArgumentException(e);
        }
    }

    /**
     * @return the URL of the file described by the feed
     */
    public URL getFileUrl() {
        return fileUrl;
    }

    /**
     * @return the URL of the {@code md5sums.txt} list that belongs to the file
     */
    public URL getMd5Url() {
        return md5Url;
    }

    /**
     * Looks up the checksum of the parsed file in the content of a checksum list.
     *
     * <p>A line matches when it consists of 32 hexadecimal digits, whitespace and then exactly the
     * file name derived from the feed.
     *
     * @param md5sums the text of the checksum list, may be {@code null}
     * @return the 32-digit hexadecimal checksum of the first matching line, or {@code null} if
     *         {@code md5sums} is {@code null} or has no line for the file
     */
    public String findMd5(String md5sums) {
        if (md5sums == null) {
            return null;
        }
        final Matcher matcher = md5Pattern.matcher(md5sums);
        return matcher.find() ? matcher.group(1) : null;
    }

    /**
     * @return a copy of the date parsed from the feed's {@code <pubDate>}; a copy is returned
     *         because {@link Date} is mutable
     */
    public Date getModifiedDate() {
        return new Date(modifiedDate.getTime());
    }
}