org.aliuge.crawler.url.WebURL.java Source code

Introduction

Here is the source code for org.aliuge.crawler.url.WebURL.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.aliuge.crawler.url;

import java.io.Serializable;
import org.apache.commons.lang3.StringUtils;
import com.google.common.base.CharMatcher;

/**
 * A URL known to the crawler together with its bookkeeping data (document
 * ids, parent, depth, priority, anchor text) and the parts derived from the
 * URL string by {@link #setUrl(String)}: domain, sub-domain and path.
 *
 * <p>Equality and hashing are based solely on the URL string.
 *
 * <p>Note: the two-argument constructor stores the URL string without parsing
 * it, so {@link #getDomain()}, {@link #getSubDomain()} and {@link #getPath()}
 * stay {@code null} until {@link #setUrl(String)} is called.
 */
public class WebURL implements Serializable, Cloneable {

    private static final long serialVersionUID = 1L;

    private String url;
    private String name;
    private int docid;
    private int parentDocid;
    private String parentUrl;
    private short depth;
    private String domain;
    private String subDomain;
    private String path;
    private String anchor;
    private byte priority;
    // Package-private in the original; presumably a "re-crawl" flag -- confirm with callers.
    boolean recraw = false;

    /**
     * Creates an empty instance; every reference field is {@code null}.
     */
    public WebURL() {
        super();
    }

    /**
     * Creates an instance with the given name and URL string. The values are
     * stored as given: the name is not trimmed and the URL is not parsed.
     *
     * @param name display name for this URL
     * @param url  the URL string
     */
    public WebURL(String name, String url) {
        this.name = name;
        this.url = url;
    }

    /**
     * Returns a shallow copy of this object.
     *
     * @return the copy, never {@code null}
     */
    @Override
    public Object clone() {
        try {
            return super.clone();
        } catch (CloneNotSupportedException e) {
            // Cannot happen: this class implements Cloneable. Fail loudly
            // instead of handing a null "copy" back to the caller.
            throw new AssertionError(e);
        }
    }

    /**
     * Returns the unique document id assigned to this Url.
     */
    public int getDocid() {
        return docid;
    }

    public void setDocid(int docid) {
        this.docid = docid;
    }

    /**
     * Returns the hash code of the URL string, or 0 when no URL has been set.
     */
    @Override
    public int hashCode() {
        // A freshly constructed instance has a null url; it must still be hashable.
        return url == null ? 0 : url.hashCode();
    }

    /**
     * Two instances are equal when they are of the same class and have equal,
     * non-null URL strings. An instance with a null URL is equal only to itself.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }

        WebURL otherUrl = (WebURL) o;
        return url != null && url.equals(otherUrl.getUrl());
    }

    @Override
    public String toString() {
        return "name:" + name + "\turl:" + url;
    }

    /**
     * Returns the Url string
     */
    public String getUrl() {
        return url;
    }

    /**
     * Stores the URL string and derives domain, sub-domain and path from it.
     *
     * <p>The host is the text between the first {@code "//"} (or the start of
     * the string when there is none) and the next {@code '/'}. The path is
     * everything from that {@code '/'} up to, but excluding, the first
     * {@code '?'}. When the host has more than two dot-separated labels, the
     * domain is its last two labels (or the last three when {@code TLDList}
     * contains the last two) and the sub-domain is the remaining leading labels.
     *
     * <p>Parsing is best effort: a failure is reported on standard error and
     * not propagated. Passing {@code null} clears the derived parts.
     *
     * @param url the URL string, may be {@code null}
     */
    public void setUrl(String url) {
        this.url = url;
        if (url == null) {
            // Nothing to parse; do not keep parts that describe a previous URL.
            domain = null;
            subDomain = null;
            path = null;
            return;
        }
        try {
            // Test the raw indexOf result: once 2 is added, a miss (-1)
            // becomes 1 and would silently drop the first host character.
            int schemeSeparatorIdx = url.indexOf("//");
            int domainStartIdx = schemeSeparatorIdx < 0 ? 0 : schemeSeparatorIdx + 2;
            int domainEndIdx = url.indexOf('/', domainStartIdx);
            if (domainEndIdx < domainStartIdx) {
                domainEndIdx = url.length();
            }

            // The path does not depend on the TLD lookup, so compute it first.
            String rawPath = url.substring(domainEndIdx);
            int queryIdx = rawPath.indexOf('?');
            path = queryIdx >= 0 ? rawPath.substring(0, queryIdx) : rawPath;

            domain = url.substring(domainStartIdx, domainEndIdx);
            subDomain = "";
            String[] parts = domain.split("\\.");
            if (parts.length > 2) {
                domain = parts[parts.length - 2] + "." + parts[parts.length - 1];
                int limit = 2;
                // When the last two labels are themselves listed in TLDList,
                // one more label belongs to the domain.
                if (TLDList.getInstance().contains(domain)) {
                    domain = parts[parts.length - 3] + "." + domain;
                    limit = 3;
                }
                StringBuilder leadingLabels = new StringBuilder();
                for (int i = 0; i < parts.length - limit; i++) {
                    if (leadingLabels.length() > 0) {
                        leadingLabels.append('.');
                    }
                    leadingLabels.append(parts[i]);
                }
                subDomain = leadingLabels.toString();
            }
        } catch (Exception e) {
            // Best effort, as in the original: TLDList is defined elsewhere and
            // its failure modes are not visible here, so report and carry on.
            e.printStackTrace();
        }
    }

    /**
     * Returns the unique document id of the parent page. The parent page is the
     * page in which the Url of this page is first observed.
     */
    public int getParentDocid() {
        return parentDocid;
    }

    public void setParentDocid(int parentDocid) {
        this.parentDocid = parentDocid;
    }

    /**
     * Returns the url of the parent page. The parent page is the page in which
     * the Url of this page is first observed.
     */
    public String getParentUrl() {
        return parentUrl;
    }

    public void setParentUrl(String parentUrl) {
        this.parentUrl = parentUrl;
    }

    /**
     * Returns the crawl depth at which this Url is first observed. Seed Urls
     * are at depth 0. Urls that are extracted from seed Urls are at depth 1,
     * etc.
     */
    public short getDepth() {
        return depth;
    }

    public void setDepth(short depth) {
        this.depth = depth;
    }

    /**
     * Returns the domain of this Url as derived by {@link #setUrl(String)}.
     * For 'http://www.example.com/sample.htm', domain will be 'example.com'
     * (assuming 'example.com' is not listed in TLDList).
     */
    public String getDomain() {
        return domain;
    }

    /**
     * Returns the sub-domain of this Url as derived by {@link #setUrl(String)}:
     * the host labels in front of the domain, or an empty string when the host
     * has no such labels.
     */
    public String getSubDomain() {
        return subDomain;
    }

    /**
     * Returns the path of this Url without the query string. For
     * 'http://www.example.com/sample.htm', path will be '/sample.htm'
     */
    public String getPath() {
        return path;
    }

    public void setPath(String path) {
        this.path = path;
    }

    /**
     * Returns the anchor string. For example, in
     * {@code <a href="example.com">A sample anchor</a>}
     * the anchor string is 'A sample anchor'
     */
    public String getAnchor() {
        return anchor;
    }

    public void setAnchor(String anchor) {
        this.anchor = anchor;
    }

    /**
     * Returns the priority for crawling this URL.
     * A lower number results in higher priority.
     */
    public byte getPriority() {
        return priority;
    }

    public void setPriority(byte priority) {
        this.priority = priority;
    }

    public String getName() {
        return name;
    }

    /**
     * Sets the name after trimming the characters matched by
     * {@code CharMatcher.INVISIBLE} from both ends. A {@code null} or blank
     * argument is ignored and the current name is kept.
     *
     * @param name the new name
     */
    public void setName(String name) {
        if (StringUtils.isNoneBlank(name)) {
            this.name = CharMatcher.INVISIBLE.trimFrom(name);
        }
    }

    public boolean isRecraw() {
        return recraw;
    }

    public void setRecraw(boolean recraw) {
        this.recraw = recraw;
    }

}