de.tudarmstadt.ukp.wikipedia.api.PageIterator.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.ukp.wikipedia.api.PageIterator.java

Source

/*******************************************************************************
* Copyright (c) 2010 Torsten Zesch.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the GNU Lesser Public License v3
* which accompanies this distribution, and is available at
* http://www.gnu.org/licenses/lgpl.html
*
* Contributors:
*     Torsten Zesch - initial API and implementation
*     Samy Ateia - Improved performance
*        see http://groups.google.com/group/jwpl/browse_thread/thread/79393bdd9fb84de9
******************************************************************************/
package de.tudarmstadt.ukp.wikipedia.api;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.hibernate.Session;
import org.hibernate.criterion.Restrictions;

import de.tudarmstadt.ukp.wikipedia.api.exception.WikiApiException;

/**
 * An iterator over page objects.
 * <p>
 * Pages are fetched in batches (see the {@code bufferSize} constructor arguments), either from
 * an explicit selection of page ids/titles or by walking over the pages stored in the database.
 * <p>
 * Note: for backward compatibility {@link #next()} returns {@code null} instead of throwing
 * {@link java.util.NoSuchElementException} when no page is left. Use {@link #hasNext()} to
 * detect the end of the iteration.
 *
 * @author zesch
 * @author Oliver Ferschke
 *
 */
public class PageIterator implements Iterator<Page> {

    private final Log logger = LogFactory.getLog(getClass());

    private final PageBuffer buffer;

    /**
     * Creates an iterator over an explicit selection of pages. Ids are processed before titles.
     *
     * @param wiki       the Wikipedia the pages are retrieved from
     * @param ids        page ids (as strings) to load; {@code null} is treated as empty
     * @param titles     page titles to load; {@code null} is treated as empty
     * @param bufferSize the maximum number of pages loaded per batch
     */
    public PageIterator(Wikipedia wiki, Set<String> ids, Set<String> titles, int bufferSize) {
        buffer = new PageBuffer(bufferSize, wiki, ids, titles);
    }

    /**
     * Creates an iterator over the pages stored in the database.
     *
     * @param wiki         the Wikipedia the pages are retrieved from
     * @param onlyArticles if {@code true}, disambiguation pages and redirects are skipped
     * @param bufferSize   the maximum number of rows requested per database query
     */
    public PageIterator(Wikipedia wiki, boolean onlyArticles, int bufferSize) {
        buffer = new PageBuffer(bufferSize, wiki, onlyArticles);
    }

    /**
     * Checks whether another page is available. This may load the next batch of pages.
     *
     * @return {@code true} if {@link #next()} will return a page
     */
    @Override
    public boolean hasNext() {
        return buffer.hasNext();
    }

    /**
     * Returns the next page.
     *
     * @return the next page, or {@code null} if no more pages are available
     */
    @Override
    public Page next() {
        return buffer.next();
    }

    /**
     * Removal is not supported by this iterator.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }

    /**
     * Buffers pages in a list and refills the list batch by batch.
     *
     * @author zesch
     * @author Oliver Ferschke
     *
     */
    class PageBuffer {

        private final Wikipedia wiki;
        private final boolean onlyArticles;

        private final List<Page> buffer;
        private final int maxBufferSize; // the number of pages to be buffered after a query to the database.
        private int bufferFillSize; // even a 500 slot buffer can be filled with only 5 elements
        private int bufferOffset; // the offset in the buffer
        private long lastPage; // the id of the last database row that has been processed

        private final List<String> pageIds; // ids still to be loaded, if a specific list of articles is supposed to be read
        private final List<String> pageTitles; // titles still to be loaded, if a specific list of articles is supposed to be read
        boolean loadFromList; // true: read the given ids/titles; false: walk over the database

        /**
         * Creates a buffer that walks over the pages stored in the database.
         *
         * @param bufferSize   the maximum number of rows requested per database query
         * @param wiki         the Wikipedia the pages are retrieved from
         * @param onlyArticles if {@code true}, disambiguation pages and redirects are skipped
         */
        public PageBuffer(int bufferSize, Wikipedia wiki, boolean onlyArticles) {
            this.maxBufferSize = bufferSize;
            this.wiki = wiki;
            this.onlyArticles = onlyArticles;
            this.buffer = new ArrayList<Page>();
            this.bufferFillSize = 0;
            this.bufferOffset = 0;
            this.lastPage = 0;
            this.pageIds = new LinkedList<String>();
            this.pageTitles = new LinkedList<String>();
            this.loadFromList = false;
        }

        /**
         * Creates a buffer that loads an explicit selection of pages.
         *
         * @param bufferSize the maximum number of pages loaded per batch
         * @param wiki       the Wikipedia the pages are retrieved from
         * @param ids        page ids (as strings) to load; {@code null} is treated as empty
         * @param titles     page titles to load; {@code null} is treated as empty
         */
        public PageBuffer(int bufferSize, Wikipedia wiki, Set<String> ids, Set<String> titles) {
            this.maxBufferSize = bufferSize;
            this.wiki = wiki;
            this.buffer = new ArrayList<Page>();
            this.onlyArticles = false;
            this.bufferFillSize = 0;
            this.bufferOffset = 0;
            this.lastPage = 0;
            // Work on private copies: entries are removed while they are being loaded.
            this.pageIds = (ids == null) ? new LinkedList<String>() : new LinkedList<String>(ids);
            this.pageTitles = (titles == null) ? new LinkedList<String>() : new LinkedList<String>(titles);
            this.loadFromList = true;
        }

        /**
         * If there are elements in the buffer left, then return true.
         * If the end of the filled buffer is reached, then try to load new buffer.
         * @return True, if there are pages left. False otherwise.
         */
        public boolean hasNext() {
            return bufferOffset < bufferFillSize || fillBuffer();
        }

        /**
         * Returns the next buffered page, loading a new batch first if the buffer is used up.
         *
         * @return The next Page or null if no more pages are available.
         */
        public Page next() {
            if (!hasNext()) {
                // neither buffered pages nor a new batch => nothing left
                return null;
            }
            Page page = buffer.get(bufferOffset);
            bufferOffset++;
            return page;
        }

        /**
         * Discards the current batch and loads the next one.
         *
         * @return true if at least one page has been buffered, false if no pages are left
         */
        private boolean fillBuffer() {
            // clear the old buffer and all variables regarding the state of the buffer
            buffer.clear();
            bufferOffset = 0;
            bufferFillSize = 0;

            // decide whether to load from list or retrieve all available articles
            return loadFromList ? fillFromList() : fillFromDatabase();
        }

        /**
         * Loads the next batch from the remaining ids and titles (ids first). Entries that are
         * null, empty, malformed or cannot be resolved are skipped with a warning.
         */
        private boolean fillFromList() {
            // Load at least one page per batch, so that a non-positive buffer size
            // cannot stall the iteration while entries are still pending.
            final int limit = Math.max(1, maxBufferSize);

            while (buffer.size() < limit && !pageIds.isEmpty()) {
                String id = pageIds.remove(0);
                if (id == null || id.isEmpty()) {
                    continue;
                }
                try {
                    buffer.add(wiki.getPage(Integer.parseInt(id)));
                } catch (NumberFormatException e) {
                    // a malformed id must not abort the whole iteration
                    logger.warn("Skipping malformed article id \"" + id + "\"");
                } catch (WikiApiException e) {
                    logger.warn("Missing article with id " + id);
                }
            }

            while (buffer.size() < limit && !pageTitles.isEmpty()) {
                String title = pageTitles.remove(0);
                if (title == null || title.isEmpty()) {
                    continue;
                }
                try {
                    buffer.add(wiki.getPage(title));
                } catch (WikiApiException e) {
                    logger.warn("Missing article with title \"" + title + "\"");
                }
            }

            bufferFillSize = buffer.size();
            return bufferFillSize > 0;
        }

        /**
         * Loads the next batch from the database. Batches in which every row is filtered out
         * (e.g. only redirects while {@code onlyArticles} is set) are skipped, so that the
         * iteration does not stop before the query itself runs out of rows.
         */
        private boolean fillFromDatabase() {
            while (buffer.isEmpty()) {
                List<?> rows = queryNextBatch();
                if (rows == null || rows.isEmpty()) {
                    return false; // the database has no further pages
                }

                final long offsetBeforeBatch = lastPage;
                for (Object row : rows) {
                    if (row == null) {
                        continue; // ignore empty rows instead of dropping the whole batch
                    }
                    de.tudarmstadt.ukp.wikipedia.api.hibernate.Page hibernatePage =
                            (de.tudarmstadt.ukp.wikipedia.api.hibernate.Page) row;
                    long id = hibernatePage.getId();
                    try {
                        Page apiPage = new Page(this.wiki, id, hibernatePage);
                        if (!onlyArticles || !apiPage.isRedirect()) {
                            buffer.add(apiPage);
                        }
                    } catch (WikiApiException e) {
                        logger.error("Page with hibernateID " + id + " not found.", e);
                    }
                    // never move the offset backwards, whatever order the rows arrive in
                    lastPage = Math.max(lastPage, id);
                }

                if (lastPage == offsetBeforeBatch) {
                    // The offset did not advance, so the same rows would be returned again.
                    break;
                }
            }

            bufferFillSize = buffer.size();
            return bufferFillSize > 0;
        }

        /**
         * Queries up to {@code maxBufferSize} rows whose id is greater than {@code lastPage}.
         * If {@code onlyArticles} is set, disambiguation pages are excluded by the query.
         */
        private List<?> queryNextBatch() {
            // NOTE(review): the query has no explicit ordering; paging by "id > lastPage"
            // presumably relies on the database returning rows in ascending id order.
            // Consider adding an ascending order on "id" - TODO confirm.
            Session session = this.wiki.__getHibernateSession();
            session.beginTransaction();
            List<?> rows;
            if (onlyArticles) {
                rows = session.createCriteria(de.tudarmstadt.ukp.wikipedia.api.hibernate.Page.class)
                        .add(Restrictions.eq("isDisambiguation", false))
                        .add(Restrictions.gt("id", lastPage))
                        .setMaxResults(maxBufferSize)
                        .list();
            } else {
                rows = session.createCriteria(de.tudarmstadt.ukp.wikipedia.api.hibernate.Page.class)
                        .add(Restrictions.gt("id", lastPage))
                        .setMaxResults(maxBufferSize)
                        .list();
            }
            session.getTransaction().commit();
            return rows;
        }

    }
}