org.wikimedia.analytics.kraken.pageview.PageviewCanonical.java Source code

Java tutorial

Introduction

Here is the source code for org.wikimedia.analytics.kraken.pageview.PageviewCanonical.java

Source

/**
 *Copyright (C) 2012-2013  Wikimedia Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.wikimedia.analytics.kraken.pageview;

import org.apache.http.NameValuePair;
import org.apache.http.client.utils.URLEncodedUtils;

import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.List;
import java.util.ListIterator;

/**
 * This class contains detailed business logic to simplify the url of a valid pageview into the canonical title
 * of the page.
 */
public class PageviewCanonical {

    /** Mode value that selects title extraction from the query string. */
    private static final String DEFAULT_MODE = "default";

    /** Path marker that precedes the article title in regular article urls. */
    private static final String WIKI_PATH_MARKER = "/wiki/";

    private Charset charset = Charset.defaultCharset();

    /** The pageview url being canonicalized; may be {@code null}. */
    private final URL url;

    /** The pageview type as reported by {@link PageviewType#determinePageviewType(URL)}. */
    private final PageviewType pageviewType;

    /** The extracted title; stays {@code null} until a canonicalize step sets it. */
    private String articleTitle;

    /**
     * Creates a canonicalizer for the given url and determines its pageview type.
     *
     * @param url the url of the pageview
     */
    public PageviewCanonical(final URL url) {
        this.url = url;
        pageviewType = PageviewType.determinePageviewType(url);
    }

    /**
     * Runs the canonicalization step that belongs to the pageview type of this url.
     * Only one handler is executed per call; types without a handler are ignored.
     *
     * @param mode {@code "default"} to extract the title from the query string where
     *             applicable; any other value (including {@code null}) makes the API and
     *             index.php handlers fall back to the raw url path
     */
    public final void canonicalize(final String mode) {
        if (pageviewType == null) {
            // Defensive: switching on a null enum value would throw a NullPointerException.
            return;
        }
        switch (pageviewType) {

        case API:
            extractMediawikiApiTitle(mode);
            break;

        case REGULAR:
            extractMediawikiRegularTitle(mode);
            break;

        case BLOG:
            canonicalizeBlogPageview();
            break;

        case IMAGE:
            canonicalizeImagePageview();
            break;

        case SEARCH:
            canonicalizeSearchQuery();
            break;

        case BANNER:
        case OTHER:
        case NONE:
        default:
            // Nothing to canonicalize for these types.
            break;
        }
    }

    /**
     * Extracts the article title from a regular pageview url and stores it in
     * {@code articleTitle}. Two url shapes are handled:
     * <ul>
     *   <li>paths containing {@code /wiki/}: the title is everything after the first
     *       {@code /wiki/};</li>
     *   <li>paths containing {@code index.php} with a query string: in default mode the
     *       value of the {@code title} parameter, otherwise the raw path.</li>
     * </ul>
     * Any other url leaves the title untouched.
     *
     * @param mode the canonicalization mode, see {@link #canonicalize(String)}
     */
    private void extractMediawikiRegularTitle(final String mode) {
        if (url == null || url.getPath() == null) {
            return;
        }
        final String path = url.getPath();
        final int markerStart = path.indexOf(WIKI_PATH_MARKER);
        if (markerStart >= 0) {
            // Keep the remainder verbatim so that titles which themselves contain "/wiki/"
            // (or urls with a prefix in front of it) are not mangled.
            this.articleTitle = path.substring(markerStart + WIKI_PATH_MARKER.length());
        } else if (url.getQuery() != null && path.contains("index.php")) {
            if (DEFAULT_MODE.equals(mode)) {
                this.articleTitle = searchQueryAction(new String[] {"title"});
            } else {
                this.articleTitle = path;
            }
        }
    }

    /**
     * Returns a copy of the url in which every {@code ;} is replaced by {@code &}.
     * NOTE(review): judging by the method name this works around query-string parsing in
     * Apache HttpComponents that does not treat {@code ;} as a parameter separator; confirm
     * against the library version in use. The replacement is applied to the whole url, not
     * only to the query part.
     *
     * @param url the url to rewrite
     * @return the rewritten url
     * @throws MalformedURLException if the rewritten string is not a valid url
     */
    private URL fixApacheHttpComponentBug(final URL url) throws MalformedURLException {
        return new URL(url.toString().replace(";", "&"));
    }

    /**
     * Enter one or more keys to search for, this list of keys is
     * interpreted as key1 or key2; this function is not intended
     * to retrieve the values of multiple keys. In that case,
     * call this function multiple times.
     *
     * @param keys the query parameter names to look for
     * @return the value of the first query parameter (in url order) whose name equals one
     *         of the keys, which may be {@code null} for a parameter without a value;
     *         {@code "key.not.found"} if no parameter matches or the url is not a valid
     *         URI; {@code "malformed.url"} if the url cannot be rewritten
     */
    private String searchQueryAction(final String[] keys) {
        try {
            final URL parsableUrl = fixApacheHttpComponentBug(url);
            final List<NameValuePair> queryParams = URLEncodedUtils.parse(parsableUrl.toURI(), "utf-8");
            for (NameValuePair param : queryParams) {
                for (String key : keys) {
                    if (key.equals(param.getName())) {
                        return param.getValue();
                    }
                }
            }
        } catch (URISyntaxException e) {
            return "key.not.found";
        } catch (MalformedURLException e) {
            return "malformed.url";
        }
        return "key.not.found";
    }

    /**
     * Extracts the article title from an API pageview url and stores it in
     * {@code articleTitle}. In default mode the value of the {@code page} or
     * {@code titles} query parameter is used, with spaces converted to underscores;
     * in any other mode the raw url path is used. Urls without a query string leave
     * the title untouched.
     *
     * @param mode the canonicalization mode, see {@link #canonicalize(String)}
     */
    private void extractMediawikiApiTitle(final String mode) {
        if (url == null || url.getQuery() == null) {
            return;
        }
        if (DEFAULT_MODE.equals(mode)) {
            final String apiTitle = searchQueryAction(new String[] {"page", "titles"});
            this.articleTitle = convertApiTitleToRegularArticleTitle(apiTitle);
        } else {
            this.articleTitle = url.getPath();
        }
    }

    /**
     * Converts a title as passed to the API (spaces) into the form used in article
     * urls (underscores).
     *
     * @param apiTitle the title taken from the API query string, may be {@code null}
     * @return the title with every space replaced by an underscore, or {@code null} if
     *         {@code apiTitle} is {@code null}
     */
    private String convertApiTitleToRegularArticleTitle(final String apiTitle) {
        if (apiTitle == null) {
            // A query parameter without a value yields null; pass it through instead of failing.
            return null;
        }
        return apiTitle.replace(' ', '_');
    }

    /**
     * This function is specifically written for PageviewType.IMAGE pageviews.
     * It removes the {@code /wikipedia/<project>/thumb/<x>/<xx>/} prefix and the trailing
     * thumbnail segment, e.g.
     * http://upload.wikimedia.org/wikipedia/commons/thumb/8/87/Nakhalfarms.jpg/220px-Nakhalfarms.jpg
     * becomes {@code Nakhalfarms.jpg}. If the result does not end in {@code .jpg},
     * {@code .png} or {@code .svg}, one more trailing segment is removed.
     *
     * @param url the image url, must not be {@code null}
     * @return the reduced path
     */
    private String parsePath(final URL url) {
        final String pathWithoutPrefix =
                url.getPath().replaceAll("/wikipedia/[a-z]*/thumb/[a-z0-9]{1}/[a-z0-9]{2}/", "");
        final String pathWithoutThumb = stripLastSegment(pathWithoutPrefix);
        if (hasImageExtension(pathWithoutThumb)) {
            return pathWithoutThumb;
        }
        return stripLastSegment(pathWithoutThumb);
    }

    /** Drops everything from the last {@code /} onwards, unless that slash is absent or leading. */
    private static String stripLastSegment(final String path) {
        final int lastSlash = path.lastIndexOf('/');
        return lastSlash > 0 ? path.substring(0, lastSlash) : path;
    }

    /** Tells whether the path ends with one of the image extensions recognised by parsePath. */
    private static boolean hasImageExtension(final String path) {
        return path.endsWith(".jpg") || path.endsWith(".png") || path.endsWith(".svg");
    }

    /**
     * Canonicalizes a blog pageview. Currently a no-op.
     */
    private void canonicalizeBlogPageview() {
        //TODO not yet implemented
    }

    /**
     * Canonicalizes a search pageview. Currently a no-op.
     */
    private void canonicalizeSearchQuery() {
        //TODO not yet implemented
    }

    /**
     * This function canonicalizes an imageview as follows:
     * Given thumbail view https://upload.wikimedia.org/wikipedia/commons/thumb/1/19/Acueducto_de_Segovia_01.jpg/600px-Acueducto_de_Segovia_01.jpg
     * that becomes upload Acueducto_de_Segovia_01.jpg
     * <p>
     * The implementation is unfinished: the reduced path is computed but not yet stored,
     * so calling this method does not change the article title. Does nothing when the url
     * or its path is {@code null}.
     */
    public final void canonicalizeImagePageview() {
        if (url == null || url.getPath() == null) {
            return;
        }
        //TODO implementation not yet finished; the parsed path is intentionally not stored yet.
        String path = parsePath(url);
    }

    /**
     * Returns the title extracted by the last call to {@link #canonicalize(String)}.
     *
     * @return the article title, or {@code null} if no title has been extracted
     */
    public final String getArticleTitle() {
        return articleTitle;
    }
}