// Java tutorial
/** * @(#)PageListProcessorSlow.java 14.12.2014 * Copyright 2013 - 2014 Dmitry Trofimovich (KIN)(DimaTrofimovich@gmail.com) * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ /** * WARNING: This file may contain Russian characters. * Recommended code page for this file is CP1251 (also called Windows-1251). * */ package org.wikipedia.nirvana.nirvanabot.pagesfetcher; import java.io.BufferedReader; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.regex.Pattern; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; import org.wikipedia.Wiki; import org.wikipedia.Wiki.Revision; import org.wikipedia.nirvana.NirvanaWiki; import org.wikipedia.nirvana.ServiceError; import org.wikipedia.nirvana.StringTools; import org.wikipedia.nirvana.WikiTools; import org.wikipedia.nirvana.nirvanabot.NirvanaBot; /** * @author kin * */ public class PageListProcessorSlow extends BasicProcessor { protected Map<String, String> pageLists; protected Map<String, String> pageListsToIgnore; /** * @param cats * @param ignore * @param lang * @param depth * @param hours * @param namespace */ public PageListProcessorSlow(WikiTools.Service service, List<String> cats, List<String> ignore, String lang, int depth, int namespace, 
PageListFetcher fetcher) { super(service, cats, ignore, lang, depth, namespace, fetcher); } /** * */ @Override public ArrayList<Revision> getNewPages(NirvanaWiki wiki) throws IOException, InterruptedException, ServiceError { ArrayList<Revision> pageInfoList = new ArrayList<Revision>(30); HashSet<String> pages = new HashSet<String>(); getData(wiki); HashSet<String> ignore = getIgnorePages(wiki, null); for (String category : categories) { log.info("Processing data of " + category); String pageList = pageLists.get(category); parsePageList(wiki, pages, pageInfoList, ignore, pageList); } return pageInfoList; } public void getData(Wiki wiki) throws IOException, InterruptedException { //log.info("Getting data for [[" + this.pageName+"]]"); pageLists = getNewPagesForCategories(categories); pageListsToIgnore = getNewPagesForCategories(categoriesToIgnore); } private Map<String, String> getNewPagesForCategories(List<String> categoriList) throws IOException, InterruptedException { Map<String, String> result = new HashMap<String, String>(); for (String category : categoriList) { Pair<Integer, String> pair = PageListProcessorSlow.extractDepthFromCat(category); String text = fetcher.loadNewPagesForCat(service, pair.getRight(), language, (pair.getLeft() >= 0) ? 
pair.getLeft() : depth, namespace); result.put(category, text); } return result; } protected static Pair<Integer, String> extractDepthFromCat(String category) { int depth = -1; int depthIndex = category.indexOf(NirvanaBot.DEPTH_SEPARATOR); if (depthIndex > 0) { try { depth = Integer.parseInt(category.substring(depthIndex + NirvanaBot.DEPTH_SEPARATOR.length())); } catch (NumberFormatException e) { // ignore } category = category.substring(0, depthIndex); } return new ImmutablePair<Integer, String>(depth, category); } public HashSet<String> getIgnorePages(NirvanaWiki wiki, HashSet<String> ignorePages) throws IOException, ServiceError { HashSet<String> ignore = ignorePages; if (ignore == null) ignore = new HashSet<String>(); for (String category : categoriesToIgnore) { //log.info("Processing data of " + category); String line; String pageList = this.pageListsToIgnore.get(category); if (pageList.startsWith("ERROR : MYSQL error")) { log.error("Invalid service output: " + StringTools.trancateTo(pageList, 100)); throw new ServiceError("Invalid output of service: " + service.getName()); } BufferedReader br = new BufferedReader(new StringReader(pageList)); for (int j = 0; j < service.SKIP_LINES; j++) br.readLine(); Pattern p = Pattern.compile(LINE_RULE); int j = 0; while ((line = br.readLine()) != null) { j++; if (line.isEmpty()) continue; if (j < LINES_TO_CHECK && !p.matcher(line).matches()) { log.error("Invalid service output line: " + line); throw new ServiceError("Invalid output of service: " + service.getName()); } String[] groups = line.split("\t"); if (groups[service.NS_POS].equals(String.valueOf(this.namespace))) { String title = groups[service.TITLE_POS].replace('_', ' '); ignore.add(title); } else if (groups[service.NS_POS].equals(String.valueOf(Wiki.USER_NAMESPACE)) && namespace != Wiki.USER_NAMESPACE) { long revId = 0; if (service.REVID_POS >= 0) { try { revId = Long.parseLong(groups[service.REVID_POS]); } catch (NumberFormatException e) { 
log.error(e.toString()); continue; } Revision r = wiki.getRevision(revId); String title = r.getPage(); ignore.add(title); } } } } return ignore; } /* (non-Javadoc) * @see org.wikipedia.nirvana.nirvanabot.pagesfetcher.PageListProcessor#mayHaveDuplicates() */ @Override public boolean mayHaveDuplicates() { return false; } }