org.wikipedia.nirvana.nirvanabot.NewPages.java Source code

Java tutorial

Introduction

Here is the source code for org.wikipedia.nirvana.nirvanabot.NewPages.java

Source

/**
 *  @(#)NewPages.java 14.12.2014
 *  Copyright  2011 - 2014 Dmitry Trofimovich (KIN, Nirvanchik, DimaTrofimovich@gmail.com)
 *    
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/**
 * WARNING: This file may contain Russian characters.
 * Recommended code page for this file is CP1251 (also called Windows-1251).
 * */

package org.wikipedia.nirvana.nirvanabot;

import org.wikipedia.Wiki;
import org.wikipedia.Wiki.Revision;
import org.wikipedia.nirvana.DateTools;
import org.wikipedia.nirvana.HTTPTools;
import org.wikipedia.nirvana.NirvanaBasicBot;
import org.wikipedia.nirvana.NirvanaWiki;
import org.wikipedia.nirvana.ServiceError;
import org.wikipedia.nirvana.StringTools;
import org.wikipedia.nirvana.WikiBooster;
import org.wikipedia.nirvana.WikiTools;
import org.wikipedia.nirvana.WikiTools.Service;
import org.wikipedia.nirvana.WikiTools.ServiceFeatures;
import org.wikipedia.nirvana.archive.Archive;
import org.wikipedia.nirvana.archive.ArchiveFactory;
import org.wikipedia.nirvana.archive.ArchiveSettings;
import org.wikipedia.nirvana.archive.ArchiveSettings.Enumeration;
import org.wikipedia.nirvana.archive.ArchiveSimple;
import org.wikipedia.nirvana.archive.ArchiveWithEnumeration;
import org.wikipedia.nirvana.archive.ArchiveWithHeaders;
import org.wikipedia.nirvana.nirvanabot.pagesfetcher.FetcherFactory;
import org.wikipedia.nirvana.nirvanabot.pagesfetcher.PageListFetcher;
import org.wikipedia.nirvana.nirvanabot.pagesfetcher.PageListProcessor;
import org.wikipedia.nirvana.nirvanabot.pagesfetcher.PageListProcessorFast;
import org.wikipedia.nirvana.nirvanabot.pagesfetcher.PageListProcessorSlow;
import org.wikipedia.nirvana.nirvanabot.pagesfetcher.ProcessorCombinator;
import org.wikipedia.nirvana.nirvanabot.pagesfetcher.RevisionWithId;
import org.wikipedia.nirvana.nirvanabot.templates.TemplateFilter;
import org.wikipedia.nirvana.nirvanabot.templates.TemplateFinder;

import org.apache.commons.lang3.StringUtils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.security.auth.login.LoginException;

/**
 * @author kin
 *
 */
public class NewPages implements PortalModule {
    private static final int WIKI_API_BUNCH_SIZE = 10;
    public static final String PATTERN_NEW_PAGES_LINE_WIKIREF_ARTICLE = "\\[\\[(?<article>[^\\]|]+)(|[^\\]]+)?\\]\\]";
    public static final String PATTERN_NEW_PAGES_LINE_TEMPLATE_ITEM = "\\{\\{(?<template>.+)\\}\\}";
    private static final String COMMENT_DELETED = "<span style=\"color:#966963\">  ''' '''</span>";
    protected String language;
    protected List<String> categories;
    protected List<String> categoriesToIgnore;
    protected List<List<String>> categoryGroups;
    protected List<List<String>> categoryToIgnoreGroups;
    protected Set<String> usersToIgnore;
    protected String pageName;
    protected String archive;
    protected ArchiveSettings archiveSettings;
    protected String format;
    protected String formatString;
    protected String header;
    protected String footer;
    protected String middle;
    protected int maxItems;
    protected int hours;
    protected WikiTools.Service service;
    protected String delimeter;
    protected int depth;
    protected int namespace;
    protected String namespaceIdentifier;
    //protected boolean markEdits;
    protected boolean minor;
    protected boolean bot;
    protected boolean fastMode;

    protected PortalParam.Deleted deletedFlag;
    protected int renamedFlag;

    protected String headerLastUsed;
    protected String footerLastUsed;
    protected String middleLastUsed;

    protected Map<String, String> pageLists;
    protected Map<String, String> pageListsToIgnore;

    protected static org.apache.log4j.Logger log;

    protected GetRevisionMethod getRevisionMethod = GetRevisionMethod.GET_REV;

    protected boolean UPDATE_FROM_OLD = true;
    protected boolean UPDATE_ARCHIVE = true;

    protected String currentUser = null;

    protected boolean supportsTemplateFilter = true;
    protected TemplateFilter templateFilter = null;

    /**
     * True means that list item may contain new page author info.
     */
    protected boolean supportAuthor = true;

    /**
     * This happens when service doesn't support fetching page list with filtering by provided
     * templates list. In this case we filter it by ours. This flag enables manual filtering
     * by template filter.
     */
    protected boolean needsCustomTemlateFiltering = false;

    static {
        log = org.apache.log4j.Logger.getLogger(NewPages.class);
    }

    protected enum GetRevisionMethod {
        GET_FIRST_REV, GET_FIRST_REV_IF_NEED, GET_FIRST_REV_IF_NEED_SAVE_ORIGINAL, GET_REV, GET_TOP_REV
        //GET_LAST_REV_BY_PAGE_NAME
    }

    /**
     *  Constructor
     */
    public NewPages(PortalParam param) {
        this.language = param.lang;
        this.categories = param.categories;
        this.categoriesToIgnore = param.categoriesToIgnore;
        categoryGroups = param.categoryGroups;
        categoryToIgnoreGroups = param.categoryToIgnoreGroups;
        this.usersToIgnore = new HashSet<String>(param.usersToIgnore);
        this.pageName = param.page;
        this.archive = param.archive;
        this.archiveSettings = param.archSettings;
        this.maxItems = param.maxItems;
        this.format = param.format;

        this.formatString = format.replace("%()", "%1$s").replace("%()", "%3$s");
        if (supportAuthor) {
            formatString = formatString.replace("%()", "%2$s");
        }
        /*
        TODO(Nirvanchik): I don't understand what is this for?
        if(param.deletedFlag==PortalParam.Deleted.MARK) {
           this.formatString = formatString.replace("%()", "%4$s");
        }
        */
        this.hours = param.hours;
        //this.Module = module;
        this.delimeter = param.delimeter;
        this.depth = param.depth;
        this.header = param.header;
        if (header == null)
            this.header = "";
        this.headerLastUsed = this.header;
        this.footer = param.footer;
        if (footer == null)
            this.footer = "";
        this.footerLastUsed = this.footer;
        this.middle = param.middle;
        if (middle == null)
            this.middle = "";
        this.middleLastUsed = this.middle;
        this.namespace = param.ns;
        this.minor = param.minor;
        this.bot = param.bot;
        this.deletedFlag = param.deletedFlag;
        this.renamedFlag = param.renamedFlag;
        this.service = param.service;
        this.fastMode = param.fastMode;
        this.templateFilter = param.templateFilter;

        log = org.apache.log4j.Logger.getLogger(this.getClass().getName());
        log.debug("Portal module created for portal subpage [[" + this.pageName + "]]");
    }

    protected String getFormatString() {
        return formatString;
    }

    public String getOldText(Wiki wiki) throws IOException {
        try {
            return wiki.getPageText(this.pageName);
        } catch (java.io.FileNotFoundException e) {
            return "";
        }
    }

    public class Data {
        String newText;
        String archiveText;
        List<String> archiveItems = null;
        int newPagesCount = 0;
        int archiveCount = 0;
        int deletedCount = 0;
        int oldCount = 0;

        public Data() {
            if (archive != null) {
                archiveItems = new ArrayList<String>();
            }
        }

        public void makeArchiveText() {
            if (archive != null && archiveItems != null && archiveItems.size() > 0) {
                if (archiveSettings.enumeration == Enumeration.HASH) {
                    enumerateWithHash(archiveItems);
                }
                archiveText = StringUtils.join(archiveItems.iterator(), delimeter) + "\n";
            }
        }
    }

    private String getPortalName() {
        String str = pageName;
        int colon = str.indexOf(':');
        if (colon >= 0) {
            str = str.substring(colon + 1);
        }
        int slash = str.indexOf('/');
        if (slash > 0) {
            str = str.substring(slash);
        }
        return str;
    }

    protected String substParams(String item, boolean constantOnly) {
        if (item.isEmpty())
            return item;
        String portal = getPortalName();
        Calendar c = Calendar.getInstance();
        String date = DateTools.printDateDayMonthYearGenitiveRussian(c);
        String str = item.replace("%()", getCurrentUser()).replace("%()", portal)
                .replace("%()", portal).replace("%()", pageName);
        if (archiveSettings != null) {
            String archive = archiveSettings.getArchiveForDate(c);
            if (archive != null) {
                str = str.replace("%()", archive);
            }
        }
        if (!constantOnly) {
            str = str.replace("%()", date);
        }
        return str;
    }

    protected class NewPagesBuffer {
        protected NirvanaWiki wiki;
        protected String header;
        protected String middle;
        protected String footer;
        protected List<String> subset = new ArrayList<String>();
        List<String> includedPages = new ArrayList<String>();

        public NewPagesBuffer(NirvanaWiki wiki) {
            this.wiki = wiki;
            this.header = substParams(NewPages.this.header, false);
            this.middle = NewPages.this.middle;
            this.footer = substParams(NewPages.this.footer, false);
        }

        public boolean contains(String item) {
            return subset.contains(item);
        }

        public int size() {
            return subset.size();
        }

        public boolean checkIsDuplicated(String archiveItem) {
            if (contains(archiveItem)) {
                return true;
            }
            for (String title : includedPages) {
                String variant1 = "[[" + title + "]]";
                String variant2 = "|" + pageTitleNormalToEscaped(
                        title.substring((namespace == 0) ? 0 : namespaceIdentifier.length() + 1)) + "|";
                if (archiveItem.contains(variant1) || archiveItem.contains(variant2)) {
                    return true;
                }
            }
            return false;
        }

        public String getNewText(String bots) {
            String text;
            if (middle.isEmpty()) {
                text = bots + header + StringUtils.join(subset.toArray(), delimeter) + footer;
            } else if (subset.size() > 1) {
                int m = (subset.size() + 1) / 2; // 2->1, 3->2, 4->2
                text = bots + header + StringUtils.join(subset.subList(0, m), delimeter) + middle
                        + StringUtils.join(subset.subList(m, subset.size()), delimeter) + footer;
            } else {
                text = bots + header + StringUtils.join(subset, delimeter) + middle + footer;
            }
            return text;
        }

        protected void addOldItem(String item) {
            subset.add(item);
        }

        protected String formatTimeString(Revision rev) {
            if (NirvanaBot.TIME_FORMAT.equalsIgnoreCase("long"))
                return rev.getTimestamp().getTime().toString();
            else {
                return String.format("%1$tFT%1$tTZ", rev.getTimestamp());
            }
        }

        String formatTitleString(String title) {
            String titleToInsert = title;
            if (namespace != 0) {
                log.debug("namespace = " + namespaceIdentifier + " namespace=" + String.valueOf(namespace)
                        + " title=" + title);
                titleToInsert = title.substring(namespaceIdentifier.length() + 1);
            }
            // There are pages like /etc or /dev/null which start with '/'
            if (titleToInsert.startsWith("/")) {
                titleToInsert = ":" + titleToInsert;
            }
            if (format.contains("{{") && format.contains("}}")) {
                titleToInsert = pageTitleNormalToEscaped(titleToInsert); // replaces '=' equal-sign by escape-code
            }
            return titleToInsert;
        }

        String formatItemString(String title, boolean deleted, Revision rev) {
            String element;
            String user = rev.getUser();
            String time = formatTimeString(rev);
            String titleToInsert = formatTitleString(title);

            element = String.format(formatString, titleToInsert, HTTPTools.removeEscape(user), time);
            if (deletedFlag == PortalParam.Deleted.MARK && deleted) {
                element = markDeleted(element);
            }
            return element;
        }

        protected void addNewItem(String title, boolean deleted, Revision rev) throws IOException {
            String element = formatItemString(title, deleted, rev);

            if (!subset.contains(element)) {
                subset.add(element);
                if (includedPages != null) {
                    includedPages.add(title);
                }
                log.debug("ADD new line: \t" + element);
            }
        }

        /*
        protected void addNewItem(String item) throws IOException {
             if (!subset.contains(item))
             {
           subset.add(item);
           if (includedPages != null) {
              includedPages.add(item);
           }
           log.debug("ADD new line: \t"+item);
             }
        }*/

        protected void addNewPage(NirvanaWiki wiki, Revision pageInfo) throws IOException {
            Revision page = null;
            switch (getRevisionMethod) {
            case GET_FIRST_REV:
                page = wiki.getFirstRevision(pageInfo.getPage());
                break;
            case GET_REV:
                page = wiki.getRevision(pageInfo.getRevid());
                break;
            case GET_TOP_REV:
                page = wiki.getTopRevision(pageInfo.getPage());
                break;
            case GET_FIRST_REV_IF_NEED:
                if (usersToIgnore.size() == 0) {
                    page = pageInfo;
                } else {
                    page = wiki.getFirstRevision(pageInfo.getPage());
                }
                break;
            case GET_FIRST_REV_IF_NEED_SAVE_ORIGINAL:
                if (usersToIgnore.size() == 0) {
                    page = pageInfo;
                } else {
                    // TODO (Nirvanchik): here we may lost an important information
                    Revision r = wiki.getFirstRevision(pageInfo.getPage());
                    if (r != null) {
                        pageInfo.setUser(r.getUser());
                    }
                }
                break;
            default:
                throw new Error("not supported mode");
            }

            if (page != null && !usersToIgnore.contains(HTTPTools.removeEscape(page.getUser()))) {
                String title_old = HTTPTools.removeEscape(pageInfo.getPage());
                String title_new = HTTPTools.removeEscape(page.getPage());
                log.debug("check page, title old: " + title_old + ", title new: " + title_new);
                boolean deleted = false;
                if (deletedFlag != PortalParam.Deleted.DONT_TOUCH && service.hasDeleted) {
                    if (!wiki.exists(title_new)) {
                        log.warn("page " + title_new + " deleted"); // page was created and renamed or deleted after that
                        deleted = true;
                        if (deletedFlag == PortalParam.Deleted.REMOVE)
                            return;
                    }
                }

                boolean renamed = !title_new.equals(title_old);

                if (renamed) {
                    if ((renamedFlag & PortalParam.RENAMED_NEW) != 0) {
                        addNewItem(title_new, deleted, page);
                    }
                    if ((renamedFlag & PortalParam.RENAMED_OLD) != 0) {
                        addNewItem(title_old, deleted, page);
                    } else {
                        if (includedPages != null) {
                            includedPages.add(title_old);
                        }
                    }
                } else {
                    addNewItem(title_new, deleted, page);
                }
            }
        }
    }

    protected PageListProcessor createPageListProcessorWithFetcher(WikiTools.Service service,
            PageListFetcher fetcher, List<String> categories, List<String> categoriesToIgnore) {
        if (service.supportsFastMode() && fastMode) {
            return new PageListProcessorFast(service, categories, categoriesToIgnore, language, depth, namespace,
                    fetcher);
        } else {
            return new PageListProcessorSlow(service, categories, categoriesToIgnore, language, depth, namespace,
                    fetcher);
        }
    }

    protected PageListProcessor createPageListFetcherForGroup(List<String> categories,
            List<String> categoriesToIgnore) throws BotFatalError {
        PageListFetcher fetcher;
        needsCustomTemlateFiltering = false;
        if (templateFilter == null) {
            fetcher = createSimpleFetcher();
        } else {
            Service service = this.service;
            if (!service.supportsFeature(WikiTools.ServiceFeatures.NEWPAGES_WITH_TEMPLATE)) {
                service = Service.getDefaultServiceForFeature(ServiceFeatures.NEWPAGES_WITH_TEMPLATE, this.service);
            }
            if (service.supportsFeature(WikiTools.ServiceFeatures.NEWPAGES_WITH_TEMPLATE)) {
                fetcher = new FetcherFactory.NewPagesWithTemplatesFetcher(templateFilter, hours);
            } else {
                needsCustomTemlateFiltering = true;
                fetcher = createSimpleFetcher();
            }
        }
        return createPageListProcessorWithFetcher(this.service, fetcher, categories, categoriesToIgnore);
    }

    private PageListFetcher createSimpleFetcher() throws BotFatalError {
        Service service = this.service;
        if (!service.supportsFeature(ServiceFeatures.NEWPAGES)) {
            service = Service.getDefaultServiceForFeature(ServiceFeatures.NEWPAGES, this.service);
        }
        if (!service.supportsFeature(ServiceFeatures.NEWPAGES)) {
            throw new BotFatalError("No service available for this project page list.");
        }
        return new FetcherFactory.NewPagesFetcher(hours);
    }

    protected PageListProcessor createPageListProcessor() throws BotFatalError {
        List<PageListProcessor> fetchers = new ArrayList<PageListProcessor>(3);
        fetchers.add(createPageListFetcherForGroup(this.categories, this.categoriesToIgnore));
        for (int i = 0; i < categoryGroups.size(); i++) {
            if (categoryGroups.get(i).size() > 0) {
                fetchers.add(createPageListFetcherForGroup(categoryGroups.get(i), categoryToIgnoreGroups.get(i)));
            }
        }
        if (fetchers.size() > 1)
            return new ProcessorCombinator(fetchers);
        else
            return fetchers.get(0);
    }

    public void sortPagesByRevision(ArrayList<Revision> pageInfoList) {
        java.util.Collections.sort(pageInfoList, new Comparator<Revision>() {

            @Override
            public int compare(Revision r1, Revision r2) {
                return (int) (r2.getRevid() - r1.getRevid());
            }

        });
    }

    public void sortPagesById(ArrayList<Revision> pageInfoList) {
        java.util.Collections.sort(pageInfoList, new Comparator<Revision>() {

            @Override
            public int compare(Revision r1, Revision r2) {
                return (int) (((RevisionWithId) r2).getId() - ((RevisionWithId) r1).getId());
            }

        });
    }

    public void sortPagesByName(ArrayList<Revision> pageInfoList) {
        java.util.Collections.sort(pageInfoList, new Comparator<Revision>() {

            @Override
            public int compare(Revision r1, Revision r2) {
                return r1.getPage().compareTo(r2.getPage());
            }

        });
    }

    public void sortPagesByDate(ArrayList<Revision> pageInfoList) {
        java.util.Collections.sort(pageInfoList, new Comparator<Revision>() {

            @Override
            public int compare(Revision r1, Revision r2) {
                return r2.getTimestamp().compareTo(r1.getTimestamp());
            }

        });
    }

    public void sortPages(ArrayList<Revision> pageInfoList, boolean byRevision) {
        if (byRevision) {
            sortPagesByRevision(pageInfoList);
        } else {
            sortPagesById(pageInfoList);
        }
    }

    protected List<Revision> getNewPages(NirvanaWiki wiki)
            throws IOException, InterruptedException, ServiceError, BotFatalError {
        PageListProcessor pageListProcessor = createPageListProcessor();
        log.info("Using pagelist fetcher: " + pageListProcessor);
        if (getRevisionMethod == GetRevisionMethod.GET_REV) {
            if (pageListProcessor.revisionAvailable()) {
                getRevisionMethod = GetRevisionMethod.GET_REV;
            } else {
                getRevisionMethod = GetRevisionMethod.GET_FIRST_REV;
            }
        }
        ArrayList<Revision> pageInfoList = pageListProcessor.getNewPages(wiki);

        pageInfoList = filterPagesByCondition(pageInfoList, wiki);

        sortPages(pageInfoList, pageListProcessor.revisionAvailable());

        if (pageListProcessor.mayHaveDuplicates()) {
            removeDuplicatesInSortedList(pageInfoList);
        }
        return pageInfoList;
    }

    protected ArrayList<Revision> filterPagesByCondition(ArrayList<Revision> pageInfoList, NirvanaWiki wiki)
            throws IOException {
        if (this.templateFilter == null
                || (!templateFilter.paramValueFiltering() && !needsCustomTemlateFiltering)) {
            return pageInfoList;
        }
        ArrayList<Revision> list = new ArrayList<>();
        String templatePrefix = wiki.namespaceIdentifier(Wiki.TEMPLATE_NAMESPACE) + ":";
        WikiBooster booster = WikiBooster.create(wiki, pageInfoList,
                StringTools.addPrefixToList(templateFilter.getTemplates(), templatePrefix));
        TemplateFinder finder = new TemplateFinder(templateFilter.getParamFilterItems(), wiki, booster);
        for (Revision r : pageInfoList) {
            if (finder.find(r.getPage())) {
                list.add(r);
            }
            booster.removePage(r.getPage());
        }
        return list;
    }

    protected String trimRight(String text, String right) {
        String textTrimmed = text;
        if (!right.isEmpty() && textTrimmed.endsWith(right)) {
            textTrimmed = text.substring(0, textTrimmed.length() - right.length());
            textTrimmed = StringTools.trimRight(textTrimmed);
        } else {
            textTrimmed = StringTools.trimRight(textTrimmed);
            if (!right.isEmpty() && textTrimmed.endsWith(right)) {
                textTrimmed = text.substring(0, textTrimmed.length() - right.length());
                textTrimmed = StringTools.trimRight(textTrimmed);
            }
        }
        return textTrimmed;
    }

    protected String trimLeft(String text, String left) {
        String textTrimmed = text;
        if (!left.isEmpty() && textTrimmed.startsWith(left)) {
            textTrimmed = text.substring(left.length());
            textTrimmed = StringTools.trimLeft(textTrimmed);
        } else {
            textTrimmed = StringTools.trimLeft(textTrimmed);
            if (!left.isEmpty() && textTrimmed.startsWith(left)) {
                textTrimmed = textTrimmed.substring(left.length());
                textTrimmed = StringTools.trimLeft(textTrimmed);
            }
        }
        return textTrimmed;
    }

    protected String trimMiddle(String text, String middle) {
        String textTrimmed = text;
        if (!middle.isEmpty()) {
            if (textTrimmed.contains(middle)) {
                textTrimmed = textTrimmed.replace(middle, delimeter);
            } else if (!middle.trim().isEmpty() && textTrimmed.contains(middle.trim())) {
                textTrimmed = textTrimmed.replace(middle.trim(), delimeter);
            }
        }
        return textTrimmed;
    }

    protected String extractBotsAllowString(String text, String botsAllowString) {
        if (botsAllowString != null) {
            int pos = text.indexOf(botsAllowString);
            text = text.substring(0, pos) + text.substring(pos + botsAllowString.length());
        }
        return text;
    }

    public class PageInfo {
        String item;
        String title;
        boolean exists;

        public PageInfo(String item) {
            this.item = item;
            this.exists = true;
            if (deletedFlag == PortalParam.Deleted.REMOVE || deletedFlag == PortalParam.Deleted.MARK) {
                extractTitle();
            }
        }

        private void extractTitle() {
            title = getNewPagesItemArticle(item);
            if (title != null)
                title = pageTitleEscapedToNormal(title);
        }
    }

    private void checkDeleted(NirvanaWiki wiki, List<PageInfo> pages) throws IOException {
        ArrayList<String> titles = new ArrayList<String>();
        boolean pagesExist[];// = new boolean[bunch.size()];
        for (int j = 0; j < pages.size(); j++) {
            if (pages.get(j).title != null) {
                titles.add(pages.get(j).title);
            }
        }
        pagesExist = wiki.exists(titles.toArray(new String[0]));
        for (int j = 0, k = 0; j < pages.size(); j++) {
            if (pages.get(j).title != null) {
                pages.get(j).exists = pagesExist[k];
                k++;
            }
        }
    }

    public void analyzeOldText(NirvanaWiki wiki, String text, Data d, NewPagesBuffer buffer) throws IOException {
        log.debug("analyzing old text");
        String oldText = text;
        String[] oldItems;

        // remove {{bots|allow=}} record
        String botsAllowString = NirvanaWiki.getAllowBotsString(text);

        log.debug("analyzing old text -> trancate header/footer/middle");
        oldText = extractBotsAllowString(oldText, botsAllowString);
        oldText = trimRight(oldText, footerLastUsed);
        oldText = trimLeft(oldText, headerLastUsed);
        oldText = trimMiddle(oldText, middleLastUsed);

        //FileTools.dump(footer, "dump", this.pageName +".footer.txt");

        oldItems = StringUtils.splitByWholeSeparator(oldText, delimeter); // removes empty items
        if (delimeter.equals("\n"))
            log.debug("delimeter is \\n");
        else if (delimeter.equals("\r"))
            log.debug("delimeter is \\r");
        else
            log.debug("delimeter is \"" + delimeter + "\"");

        log.debug("analyzing old text -> parse items, len = " + oldItems.length);

        d.oldCount = oldItems.length;
        int pos = 0;
        while (pos < oldItems.length && UPDATE_FROM_OLD && buffer.size() < maxItems) {
            ArrayList<PageInfo> bunch = new ArrayList<PageInfo>(WIKI_API_BUNCH_SIZE);
            for (; bunch.size() < WIKI_API_BUNCH_SIZE && pos < oldItems.length; pos++) {
                String item = oldItems[pos];
                if (item.isEmpty())
                    continue;
                log.trace("check old line: \t" + item + "");
                if (buffer.checkIsDuplicated(item)) {
                    log.debug("SKIP old line: \t" + item);
                    continue;
                }
                bunch.add(new PageInfo(item));
            }
            if (this.deletedFlag == PortalParam.Deleted.REMOVE || this.deletedFlag == PortalParam.Deleted.MARK) {
                checkDeleted(wiki, bunch);
            }
            int j = 0;
            for (; j < bunch.size() && buffer.size() < maxItems; j++) {
                String item = bunch.get(j).item;
                if (this.deletedFlag == PortalParam.Deleted.REMOVE && !bunch.get(j).exists) {
                    log.debug("REMOVE old line: \t" + item);
                    d.deletedCount++;
                } else {
                    if (this.deletedFlag == PortalParam.Deleted.MARK) {
                        item = bunch.get(j).exists ? unmarkDeleted(item) : markDeleted(item);
                    }
                    log.debug("ADD old line: \t" + item);
                    buffer.addOldItem(item);
                }
            }
            pos = pos - bunch.size() + j;
        }
        for (; pos < oldItems.length; pos++) {
            String item = oldItems[pos];
            if (item.isEmpty())
                continue;
            log.trace("check old line: \t" + item + "");
            if (buffer.checkIsDuplicated(item)) {
                log.debug("SKIP old line: \t" + item);
                continue;
            }
            log.debug("ARCHIVE old line:" + item);
            if (UPDATE_ARCHIVE && archive != null) {
                d.archiveItems.add(item);
            }
            d.archiveCount++;
        }
        log.debug("bots allow string: " + botsAllowString);
        if (!botsAllowString.isEmpty()) {
            botsAllowString = botsAllowString + "\n";
        }
        d.newText = buffer.getNewText(botsAllowString);
    }

    protected NewPagesBuffer createPagesBuffer(NirvanaWiki wiki) {
        return new NewPagesBuffer(wiki);
    }

    public Data getData(NirvanaWiki wiki, String text)
            throws IOException, InterruptedException, ServiceError, BotFatalError {
        log.info("Get data for [[" + this.pageName + "]]");

        List<Revision> pageInfoList = getNewPages(wiki);

        NewPagesBuffer buffer = createPagesBuffer(wiki);
        int count = pageInfoList.size();
        count = count < maxItems ? count : maxItems;
        for (int i = 0; i < count; ++i) {
            buffer.addNewPage(wiki, pageInfoList.get(i));
        }

        // Add elements from old page
        Data d = new Data();

        if (true/*count < maxItems /*|| archive!=null*/) {
            analyzeOldText(wiki, text, d, buffer);
        }

        if (UPDATE_ARCHIVE)
            d.makeArchiveText();

        //archiveItems.i
        d.newPagesCount = buffer.size() - (d.oldCount - d.archiveCount - d.deletedCount);
        if (d.newPagesCount < 0)
            d.newPagesCount = 0;
        log.debug("updated items count: " + buffer.size());
        log.debug("old items count: " + d.oldCount);
        log.debug("archive count: " + d.archiveCount);
        log.debug("deleted count: " + d.deletedCount);
        log.debug("new items count: " + d.newPagesCount);

        return d;
    }

    private void removeDuplicatesInSortedList(ArrayList<Revision> list) {
        log.debug("removing duplicates from list");
        int i = 1;
        while (i < list.size()) {
            if (list.get(i).getPage().equals(list.get(i - 1).getPage())) {
                list.remove(i);
            } else {
                i++;
            }
        }
    }

    public void enumerateWithHash(List<String> list) {
        for (int i = 0; i < list.size(); i++) {
            String item = list.get(i);
            if (item.startsWith("#")) {
                // do nothing
            } else if (item.startsWith("*")) {
                item = "#" + item.substring(1);
            } else {
                item = "# " + item;
            }
            list.set(i, item);
        }
        return;
    }

    protected String getCurrentUser(NirvanaWiki wiki) {
        if (this.currentUser == null) {
            Wiki.User user = wiki.getCurrentUser();
            this.currentUser = user.getUsername();
        }
        return this.currentUser;
    }

    protected String getCurrentUser() {
        return this.currentUser;
    }

    protected boolean checkAllowBots(NirvanaWiki wiki, String text) {

        log.debug("current user retrieved");
        if (!NirvanaWiki.allowBots(text, getCurrentUser(wiki))) {
            //reportData.status = Status.DENIED;
            log.info("bots/nobots template forbids updating this portal section");
            return false;
        }
        return true;
    }

    @Override
    public boolean update(NirvanaWiki wiki, ReportItem reportData, String comment)
            throws IOException, LoginException, InterruptedException, ServiceError, BotFatalError {
        log.debug("=> update()");
        this.namespaceIdentifier = wiki.namespaceIdentifier(this.namespace);
        boolean updated = false;
        String text = getOldText(wiki);
        log.debug("old text retrieved");

        if (!checkAllowBots(wiki, text)) {
            return false;
        }

        getHeaderFooterChanges(wiki, reportData.template, reportData.portal);

        Data d = getData(wiki, text);
        reportData.pagesArchived = d.archiveCount;
        reportData.newPagesFound = d.newPagesCount;

        if (text == null) {
            log.trace("text = null");
            text = "";
        }

        if (d.newText == null) {
            log.trace("d.newText = null");
            return false;
        }

        if (d.newText != null && !d.newText.isEmpty()
                && !(d.newText.equals(text) || d.newText.equals(text.trim()))) {

            String str = "+" + String.valueOf(d.newPagesCount) + " ";
            if (UPDATE_ARCHIVE && archive != null && d.archiveCount > 0) {
                str = str + ", -" + d.archiveCount + "  ";
            }
            if (this.deletedFlag == PortalParam.Deleted.REMOVE && d.deletedCount > 0) {
                str = str + ", -" + d.deletedCount + " ";
            }
            log.info("Updating [[" + this.pageName + "]] " + str);
            wiki.edit(pageName, d.newText, str, this.minor, this.bot);
            updated = true;
            reportData.updated = updated;
            //wiki.Save(Page, newText, Module.UpdateComment, !MarkEdits ? MinorFlags.NotMinor : MinorFlags.None, MarkEdits);
            if (UPDATE_ARCHIVE && archive != null && (d.archiveText != null || d.archiveItems.size() > 0)) {
                waitPauseIfNeed();
                log.info("Updating archive");
                updateArchive(wiki, d, reportData);
            }
        }
        return updated;
    }

    protected void waitPauseIfNeed() throws InterruptedException {
        if (NirvanaBot.UPDATE_PAUSE > 0) {
            log.debug("Waiting " + NirvanaBot.UPDATE_PAUSE + " ms");
            Thread.sleep(NirvanaBot.UPDATE_PAUSE);
        }
    }

    /**
     * @throws IOException 
      * 
      */
    private void getHeaderFooterChanges(NirvanaWiki wiki, String template, String portalSettingsPage)
            throws IOException {
        Revision rNewPages = wiki.getTopRevision(pageName);
        if (rNewPages == null) {
            return;
        }

        Revision[] revs = wiki.getPageHistory(portalSettingsPage, Calendar.getInstance(), rNewPages.getTimestamp());
        if (revs.length == 0) {
            return;
        }

        Revision r = revs[revs.length - 1].getPrevious();

        if (r == null) {
            return;
        }
        // get last used header/footer
        log.info("portal params were changed after last use");

        String settingsText = r.getText();
        Map<String, String> options = new HashMap<String, String>();
        String userNamespace = wiki.namespaceIdentifier(Wiki.USER_NAMESPACE);
        if (NirvanaBasicBot.TryParseTemplate(template, userNamespace, settingsText, options, true)) {
            headerLastUsed = NirvanaBot.getDefaultHeader();
            footerLastUsed = NirvanaBot.getDefaultFooter();
            middleLastUsed = NirvanaBot.getDefaultMiddle();
            String key = "";
            if (options.containsKey(key) && !options.get(key).isEmpty()) {
                headerLastUsed = options.get(key).replace("\\n", "\n");
            }

            key = "";
            if (options.containsKey(key) && !options.get(key).isEmpty()) {
                footerLastUsed = options.get(key).replace("\\n", "\n");
            }
            key = "";
            if (options.containsKey(key) && !options.get(key).isEmpty()) {
                middleLastUsed = options.get(key).replace("\\n", "\n");
            }

            if (headerLastUsed == null) {
                headerLastUsed = "";
            }

            if (footerLastUsed == null) {
                footerLastUsed = "";
            }

            if (middleLastUsed == null) {
                middleLastUsed = "";
            }
        }
    }

    public void updateArchive(NirvanaWiki wiki, Data d, ReportItem reportData) throws LoginException, IOException {
        if (archiveSettings == null || archiveSettings.isSimple()) {
            log.debug("archive has simple format");
            log.info("Updating " + archive);
            wiki.prependOrCreate(archive, d.archiveText, "+" + d.archiveCount + " ", this.minor, this.bot);
            reportData.archived = true;
            return;
        }

        // we need here a map of archives - Archive objects 

        log.debug("archive has difficult format");

        Archive defaultArchive = null;
        String defaultArhiveName = null;
        if (archiveSettings.isSingle()) {
            defaultArhiveName = archiveSettings.archive;
        } else {
            defaultArhiveName = archiveSettings.getArchiveForDate(Calendar.getInstance());
        }
        defaultArchive = ArchiveFactory.createArchive(archiveSettings, wiki, defaultArhiveName, delimeter);

        HashMap<String, Archive> hmap = null;
        hmap = new HashMap<String, Archive>(3);
        hmap.put(defaultArhiveName, defaultArchive);

        for (int i = d.archiveItems.size() - 1; i >= 0; i--) {
            String item = d.archiveItems.get(i);
            log.debug("archiving item: " + item);
            Calendar c = getNewPagesItemDate(wiki, item);
            if (c == null) {
                defaultArchive.add(item);
                continue;
            }
            // 1) get archive object for this item
            Archive thisArchive = defaultArchive;
            if (!archiveSettings.isSingle()) {
                String arname = archiveSettings.getArchiveForDate(c);
                thisArchive = hmap.get(arname);
                if (thisArchive == null) {
                    thisArchive = ArchiveFactory.createArchive(archiveSettings, wiki, arname, delimeter);
                    hmap.put(arname, thisArchive);
                }
            }
            // find where to put this item
            if (archiveSettings.withoutHeaders()) {
                if (!archiveSettings.hasHtmlEnumeration()) {
                    ((ArchiveSimple) thisArchive).add(item);
                } else {
                    ((ArchiveWithEnumeration) thisArchive).add(item);
                }
            } else {
                String thisHeader = archiveSettings.getHeaderForDate(c);
                String superHeader = archiveSettings.getHeaderHeaderForDate(c);
                if (superHeader == null) {
                    ((ArchiveWithHeaders) thisArchive).add(item, thisHeader);
                } else {
                    ((ArchiveWithHeaders) thisArchive).add(item, thisHeader, superHeader);
                }
            }

        }

        Iterator<Entry<String, Archive>> it = hmap.entrySet().iterator();
        while (it.hasNext()) {
            Entry<String, Archive> ar = it.next();
            Archive thisArchive = ar.getValue();
            if (thisArchive.newItemsCount() > 0) {
                log.info("Updating " + ar.getKey());
                thisArchive.update(wiki, ar.getKey(), minor, bot);
            }
        }
        reportData.archived = true;
        return;
    }

    public static Calendar getNewPagesItemDate(NirvanaWiki wiki, String item) {
        Calendar c = null;
        //NewPagesItem itemData = null;
        Pattern p = Pattern.compile(PATTERN_NEW_PAGES_LINE_WIKIREF_ARTICLE);
        Matcher m = p.matcher(item);
        boolean foundBrackets = false;
        while (m.find()) {
            String article = m.group("article");
            if (article.startsWith(":"))
                article = article.substring(1); // special case when title starts from : this : is not included in title
            if (article.contains("::    ")) {
                // foundBrackets should stay false for this case
                continue;
            }
            Revision r = null;
            try {
                r = wiki.getFirstRevision(article, true);
            } catch (IOException e) {
                //e.printStackTrace();
                log.warn("article " + article + " not found");
            }
            if (r != null) {
                return r.getTimestamp();
            }
            foundBrackets = true;
        }
        if (foundBrackets)
            return null;
        //     [[]] ,    {{}}
        p = Pattern.compile(PATTERN_NEW_PAGES_LINE_TEMPLATE_ITEM);
        m = p.matcher(item);
        if (m.find()) {
            String templateString = m.group("template");
            //log.debug("count = "+String.valueOf(m.groupCount()));

            //if(templateString!=null) System.out.println("found template: "+templateString);
            String[] items = templateString.split("\\|");
            // 2012-02-26T16:10:36Z
            //p = Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z");
            for (String s : items) {
                c = DateTools.parseDate(s);
                if (c != null)
                    return c;
            }
            for (int i = 1; i < items.length; i++) {
                String s = items[i];
                //System.out.println("check string: "+s);
                Revision r = null;
                try {
                    r = wiki.getFirstRevision(s, true);
                    //r = null;
                } catch (IOException e) {
                    log.warn("page " + s + " not found");
                }
                if (r != null)
                    return r.getTimestamp();
            }
        }
        return null;
    }

    public static boolean userNamespace(String article) {
        return (article.startsWith(":") || article.startsWith("_")
                || article.startsWith(" ") || article.startsWith("User:")
                || article.startsWith("User_talk:"));
    }

    public static boolean categoryNamespace(String article) {
        return (article.startsWith(":") || article.startsWith("Category:"));
    }

    // TODO       !!!
    public static String getNewPagesItemArticle(String item) {
        Pattern p = Pattern.compile(PATTERN_NEW_PAGES_LINE_WIKIREF_ARTICLE);
        Matcher m = p.matcher(item);
        while (m.find()) {
            String article = m.group("article");
            if (article.startsWith(":"))
                article = article.substring(1); // special case when title starts from : this : is not included in title
            if (article.contains("::    ")) {
                continue;
            }
            if (!userNamespace(article))
                return article;
        }
        p = Pattern.compile(PATTERN_NEW_PAGES_LINE_TEMPLATE_ITEM);
        m = p.matcher(item);
        if (m.find()) {
            String templateString = m.group("template");
            //log.debug("count = "+String.valueOf(m.groupCount()));

            //if(templateString!=null) System.out.println("found template: "+templateString);
            String[] items = templateString.split("\\|");
            // 2012-02-26T16:10:36Z
            //p = Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z");
            for (int i = 1; i < items.length; i++) {
                String s = items[i].trim();
                String article = s;
                if (!userNamespace(article))
                    return article;
            }
        }
        return null;
    }

    public static String unmarkDeleted(String item) {
        String str = item;
        int pos = str.indexOf(COMMENT_DELETED);
        if (pos >= 0) {
            str = str.substring(0, pos) + ((pos + COMMENT_DELETED.length()) < str.length()
                    ? str.substring(pos + COMMENT_DELETED.length())
                    : "");
            return str;
        }
        Pattern p = Pattern.compile(".*\\{\\{( | 2)\\|.+\\|.+\\|.+\\|\\}\\}.*");
        Matcher m = p.matcher(item);
        if (m.matches()) {
            pos = str.indexOf("|}}");
            if (pos >= 0) {
                str = str.substring(0, pos) + str.substring(pos + 3);
            }
        }
        return str;
    }

    public static String markDeleted(String item) {
        String str = item;
        if (str.contains("''' '''") || str.contains("|}}"))
            return str;
        Pattern p = Pattern.compile(".*\\[\\[(.+)\\]\\].*");
        Matcher m = p.matcher(item);
        if (m.matches()) {
            str = str + COMMENT_DELETED;
        }
        p = Pattern.compile(".*\\{\\{( | 2)\\|.+\\|.+\\|.+\\|\\}\\}.*");
        m = p.matcher(item);
        if (m.matches() && str.contains("{{ ")) {
            int pos = str.indexOf("}}");
            str = str.substring(0, pos) + "" + str.substring(pos);
        } else {
            p = Pattern.compile(".*\\{\\{( | 2)\\|.+\\|.+\\|.+\\}\\}.*");
            m = p.matcher(item);
            if (m.matches() && str.contains("{{ ")) {
                int pos = str.indexOf("}}");
                str = str.substring(0, pos) + "|" + str.substring(pos);
            }
        }
        return str;
    }

    public static String pageTitleNormalToEscaped(String str) {
        return str.replace("=", "&#61;");
    }

    public static String pageTitleEscapedToNormal(String str) {
        return str.replace("&#61;", "=");
    }
}