org.archive.modules.fetcher.BdbCookieStore.java Source code

Java tutorial

Introduction

Here is the source code for org.archive.modules.fetcher.BdbCookieStore.java

Source

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.modules.fetcher;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.SortedMap;

import org.apache.commons.collections.collection.CompositeCollection;
import org.apache.http.client.CookieStore;
import org.apache.http.cookie.Cookie;
import org.archive.bdb.BdbModule;
import org.archive.checkpointing.Checkpoint;
import org.springframework.beans.factory.annotation.Autowired;

import com.google.common.net.InternetDomainName;
import com.sleepycat.bind.ByteArrayBinding;
import com.sleepycat.bind.serial.SerialBinding;
import com.sleepycat.bind.serial.StoredClassCatalog;
import com.sleepycat.collections.StoredCollection;
import com.sleepycat.collections.StoredSortedMap;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseException;

/**
 * Cookie store using bdb for storage. Cookies are stored in a SortedMap keyed
 * by {@link #sortableKey(Cookie)}, so they are grouped together by domain.
 * {@link #cookieStoreFor(String)} returns a facade whose
 * {@link CookieStore#getCookies()} returns a list of cookies limited to
 * the supplied host and parent domains, if applicable.
 * 
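 * @see <a href="https://webarchive.jira.com/browse/HER-2070">HER-2070</a>
 * @see <a href="https://github.com/internetarchive/heritrix3/pull/96">heritrix3 pull request 96</a>
 * @see <a href="https://groups.yahoo.com/neo/groups/archive-crawler/conversations/messages/8620">archive-crawler mailing list, message 8620</a>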
 * 
 * @contributor nlevitt
 */
public class BdbCookieStore extends AbstractCookieStore implements FetchHTTPCookieStore, CookieStore {

    /**
     * A read-only {@link List} facade over an arbitrary {@link Collection}.
     * Needed because httpclient requires {@code List<Cookie>}.
     *
     * <p>
     * Only the operations that can be answered by delegating straight to the
     * wrapped collection are supported: {@link #size()}, {@link #isEmpty()},
     * {@link #contains(Object)}, {@link #containsAll(Collection)} and the two
     * {@code toArray} variants. Everything else throws
     * {@link UnsupportedOperationException} (a {@link RuntimeException}),
     * either because the list is immutable or because the operation is
     * simply not implemented.
     *
     * <p>
     * {@link #iterator()} is deliberately among the unimplemented methods:
     * this class is used to wrap a bdb {@link StoredCollection}, and
     * iterators from that class need to be explicitly closed. Since this
     * class hides the fact that a StoredCollection underlies it, it simply
     * prevents {@link #iterator()} from being used.
     *
     * @param <T> element type
     */
    public static class RestrictedCollectionWrappedList<T> implements List<T> {
        /** The collection every supported operation delegates to. */
        private final Collection<T> wrapped;

        /**
         * @param wrapped collection to expose as a list; it is referenced,
         *            not copied, so later changes to it show through
         */
        public RestrictedCollectionWrappedList(Collection<T> wrapped) {
            this.wrapped = wrapped;
        }

        /** Builds the exception thrown by every mutating method. */
        private static UnsupportedOperationException immutable() {
            return new UnsupportedOperationException("immutable list");
        }

        /** Builds the exception thrown by every unimplemented read method. */
        private static UnsupportedOperationException notImplemented() {
            return new UnsupportedOperationException("not implemented");
        }

        @Override
        public int size() {
            return wrapped.size();
        }

        @Override
        public boolean isEmpty() {
            return wrapped.isEmpty();
        }

        @Override
        public boolean contains(Object o) {
            return wrapped.contains(o);
        }

        /**
         * Always throws; see the class comment for why iteration is
         * deliberately unavailable.
         */
        @Override
        public Iterator<T> iterator() {
            throw notImplemented();
        }

        @Override
        public Object[] toArray() {
            return wrapped.toArray();
        }

        @Override
        public <E> E[] toArray(E[] a) {
            return wrapped.toArray(a);
        }

        @Override
        public boolean add(T e) {
            throw immutable();
        }

        @Override
        public boolean remove(Object o) {
            throw immutable();
        }

        @Override
        public boolean containsAll(Collection<?> c) {
            return wrapped.containsAll(c);
        }

        @Override
        public boolean addAll(Collection<? extends T> c) {
            throw immutable();
        }

        @Override
        public boolean addAll(int index, Collection<? extends T> c) {
            throw immutable();
        }

        @Override
        public boolean removeAll(Collection<?> c) {
            throw immutable();
        }

        @Override
        public boolean retainAll(Collection<?> c) {
            throw immutable();
        }

        @Override
        public void clear() {
            throw immutable();
        }

        @Override
        public T get(int index) {
            throw notImplemented();
        }

        @Override
        public T set(int index, T element) {
            throw immutable();
        }

        @Override
        public void add(int index, T element) {
            throw immutable();
        }

        @Override
        public T remove(int index) {
            throw immutable();
        }

        @Override
        public int indexOf(Object o) {
            throw notImplemented();
        }

        @Override
        public int lastIndexOf(Object o) {
            throw notImplemented();
        }

        @Override
        public ListIterator<T> listIterator() {
            throw notImplemented();
        }

        @Override
        public ListIterator<T> listIterator(int index) {
            throw notImplemented();
        }

        @Override
        public List<T> subList(int fromIndex, int toIndex) {
            throw notImplemented();
        }
    }

    /** The bdb module used by {@link #prepare()} to open the cookie database. */
    protected BdbModule bdb;

    /**
     * Supplies the {@link BdbModule} this store opens its database through.
     * Marked {@code @Autowired}, so Spring is expected to call it.
     *
     * @param bdb module to use; stored as-is, no validation is performed
     */
    @Autowired
    public void setBdbModule(BdbModule bdb) {
        this.bdb = bdb;
    }

    /** Name of the bdb database opened by {@link #prepare()}. */
    public static String COOKIEDB_NAME = "hc_httpclient_cookies";

    /** Database handle obtained in {@link #prepare()}; null until then. */
    private transient Database cookieDb;
    /**
     * Sorted view over {@link #cookieDb}, created in {@link #prepare()}; null
     * until then. Keys are the UTF-8 bytes of {@code sortableKey(cookie)}
     * (see {@link #addCookieImpl(Cookie)}).
     */
    private transient StoredSortedMap<byte[], Cookie> cookies;

    /**
     * Opens the bdb database named {@link #COOKIEDB_NAME} (non-transactional,
     * created if absent, no sorted duplicates) and builds the sorted map that
     * backs this store on top of it.
     *
     * <p>
     * Requires {@link #setBdbModule(BdbModule)} to have been called first.
     *
     * @throws RuntimeException wrapping any {@link DatabaseException} raised
     *             while opening the database or building the map
     */
    public void prepare() {
        try {
            final StoredClassCatalog catalog = bdb.getClassCatalog();

            final BdbModule.BdbConfig config = new BdbModule.BdbConfig();
            config.setTransactional(false);
            config.setAllowCreate(true);
            config.setSortedDuplicates(false);

            // NOTE(review): the third argument presumably tells BdbModule to
            // reuse existing data when recovering from a checkpoint - confirm
            // against BdbModule.openDatabase.
            cookieDb = bdb.openDatabase(COOKIEDB_NAME, config, isCheckpointRecovery);

            final ByteArrayBinding keyBinding = new ByteArrayBinding();
            final SerialBinding<Cookie> valueBinding =
                    new SerialBinding<Cookie>(catalog, Cookie.class);
            cookies = new StoredSortedMap<byte[], Cookie>(cookieDb, keyBinding, valueBinding, true);
        } catch (DatabaseException dbFailure) {
            throw new RuntimeException(dbFailure);
        }
    }

    /**
     * Stores {@code cookie} under the UTF-8 bytes of its
     * {@code sortableKey(cookie)}, replacing any entry already stored under
     * that key. If the cookie is already expired as of the current time, the
     * entry under that key is removed instead.
     *
     * @param cookie cookie to add, or to evict if it has expired
     */
    public void addCookieImpl(Cookie cookie) {
        final byte[] keyBytes;
        try {
            keyBytes = sortableKey(cookie).getBytes("UTF-8");
        } catch (UnsupportedEncodingException cannotHappen) {
            throw new RuntimeException(cannotHappen); // impossible
        }

        final boolean expired = cookie.isExpired(new Date());
        if (expired) {
            cookies.remove(keyBytes);
            return;
        }
        cookies.put(keyBytes, cookie);
    }

    /**
     * Returns the stored cookies whose key lies in the half-open range from
     * {@code host + ";"} (inclusive) to {@code host + "<"} (exclusive), where
     * {@code '<'} is the character immediately following {@code ';'}. In
     * other words: every key that starts with {@code host} followed by the
     * {@code ';'} delimiter.
     *
     * <p>
     * NOTE(review): this presumes {@code sortableKey(Cookie)} produces keys
     * of the form {@code <host>;...} - confirm against its definition.
     *
     * @param host host or domain name to select cookies for
     * @return the values of the matching sub-map of the cookie map
     */
    protected Collection<Cookie> hostSubset(String host) {
        // first character that sorts after the ';' delimiter
        final char afterDelimiter = (char) (';' + 1);
        final String lowerBound = host + ";";
        final String upperBound = host + afterDelimiter;

        try {
            final byte[] fromKey = lowerBound.getBytes("UTF-8");
            final byte[] toKey = upperBound.getBytes("UTF-8");
            final SortedMap<byte[], Cookie> range = cookies.subMap(fromKey, toKey);
            return range.values();
        } catch (UnsupportedEncodingException cannotHappen) {
            throw new RuntimeException(cannotHappen); // impossible
        }
    }

    /**
     * Builds a {@link LimitedCookieStoreFacade} whose
     * {@link LimitedCookieStoreFacade#getCookies()} exposes only the cookies
     * stored for {@code host} and, when {@code host} is a valid internet
     * domain name, for each of its parent domains as well.
     *
     * <p>
     * A host that is not a valid domain name (per
     * {@link InternetDomainName#isValid(String)}) is looked up verbatim, with
     * no parent traversal.
     *
     * @param host host name to limit the returned store to
     * @return a cookie store restricted to the matching cookies
     */
    public CookieStore cookieStoreFor(String host) {
        final CompositeCollection combined = new CompositeCollection();

        if (!InternetDomainName.isValid(host)) {
            // not a parseable domain name: use the host string as-is
            combined.addComposited(hostSubset(host.toString()));
        } else {
            // walk from the host itself up through every parent domain
            for (InternetDomainName current = InternetDomainName.from(host);
                    current != null;
                    current = current.hasParent() ? current.parent() : null) {
                combined.addComposited(hostSubset(current.toString()));
            }
        }

        // CompositeCollection is a raw type, hence the unchecked conversion
        @SuppressWarnings("unchecked")
        List<Cookie> restricted = new RestrictedCollectionWrappedList<Cookie>(combined);
        return new LimitedCookieStoreFacade(restricted);
    }

    /**
     * Intentionally empty: this store performs no work of its own when a
     * checkpoint starts.
     *
     * @param checkpointInProgress ignored
     */
    @Override
    public void startCheckpoint(Checkpoint checkpointInProgress) {
        // do nothing; handled by map checkpoint via BdbModule
    }

    /**
     * Intentionally empty: this store writes nothing itself during a
     * checkpoint. Despite the declared {@code throws} clause, this
     * implementation never throws.
     *
     * @param checkpointInProgress ignored
     */
    @Override
    public void doCheckpoint(Checkpoint checkpointInProgress) throws IOException {
        // do nothing; handled by map checkpoint via BdbModule
    }

    /**
     * Intentionally empty: this store has nothing to clean up when a
     * checkpoint finishes.
     *
     * @param checkpointInProgress ignored
     */
    @Override
    public void finishCheckpoint(Checkpoint checkpointInProgress) {
        // do nothing; handled by map checkpoint via BdbModule
    }

    /**
     * Whether this store is being started as part of a checkpoint recovery.
     * Set by {@link #setRecoveryCheckpoint(Checkpoint)} and passed to
     * {@code BdbModule.openDatabase} in {@link #prepare()} (in which case,
     * reuse stored cookie data?).
     */
    protected boolean isCheckpointRecovery = false;

    /**
     * Records that a checkpoint recovery is under way. The checkpoint itself
     * is not inspected; only the flag read later by {@link #prepare()} is
     * set.
     *
     * @param recoveryCheckpoint ignored
     */
    @Override
    public void setRecoveryCheckpoint(Checkpoint recoveryCheckpoint) {
        // just remember that we are doing checkpoint-recovery;
        // actual state recovery happens via BdbModule
        isCheckpointRecovery = true;
    }

    /**
     * Removes every cookie from the backing map.
     *
     * <p>
     * Unlike {@link #getCookies()}, this does not guard against the map being
     * null: within this class the map is only assigned in {@link #prepare()},
     * so calling this earlier fails with a {@link NullPointerException}.
     */
    @Override
    public void clear() {
        cookies.clear();
    }

    /**
     * Exposes all stored cookies as a read-only list.
     *
     * @return an immutable list view of the cookies, or {@code null} if the
     *         backing map has not been created yet (it is assigned in
     *         {@link #prepare()})
     */
    @Override
    public List<Cookie> getCookies() {
        if (cookies == null) {
            return null;
        }
        return new RestrictedCollectionWrappedList<Cookie>(cookies.values());
    }

    /**
     * Not supported by this store. Expired cookies are instead dropped one
     * at a time when they are passed to {@link #addCookieImpl(Cookie)}.
     *
     * @param date ignored
     * @return never returns normally
     * @throws UnsupportedOperationException always
     */
    @Override
    public boolean clearExpired(Date date) {
        // UnsupportedOperationException is the standard signal for an
        // operation an implementation chooses not to provide; it is still a
        // RuntimeException, so existing catch blocks keep working.
        throw new UnsupportedOperationException("not implemented");
    }
}