Java tutorial
/** * Copyright 2013 Diego Ceccarelli * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package it.cnr.isti.hpc.wikipedia.parser; import it.cnr.isti.hpc.wikipedia.article.Article; import it.cnr.isti.hpc.wikipedia.article.Article.Type; import it.cnr.isti.hpc.wikipedia.article.Language; import it.cnr.isti.hpc.wikipedia.article.Link; import it.cnr.isti.hpc.wikipedia.article.Table; import it.cnr.isti.hpc.wikipedia.article.Template; import java.util.ArrayList; import java.util.LinkedList; import java.util.List; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import de.tudarmstadt.ukp.wikipedia.parser.Content; import de.tudarmstadt.ukp.wikipedia.parser.ContentElement; import de.tudarmstadt.ukp.wikipedia.parser.DefinitionList; import de.tudarmstadt.ukp.wikipedia.parser.NestedList; import de.tudarmstadt.ukp.wikipedia.parser.NestedListContainer; import de.tudarmstadt.ukp.wikipedia.parser.Paragraph; import de.tudarmstadt.ukp.wikipedia.parser.ParsedPage; import de.tudarmstadt.ukp.wikipedia.parser.Section; import de.tudarmstadt.ukp.wikipedia.parser.Span; import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser; /** * Generates a Mediawiki parser given a language, (it will expect to find a * locale file in <tt>src/main/resources/</tt>). * * @see Locale * * @author Diego Ceccarelli <diego.ceccarelli@isti.cnr.it> * * Created on Feb 14, 2013 */ public class ArticleParser { static MediaWikiParserFactory parserFactory = new MediaWikiParserFactory(); private static final Logger logger = LoggerFactory.getLogger(ArticleParser.class); /** the language (used for the locale) default is English **/ private String lang = Language.EN; static int shortDescriptionLength = 500; private List<String> redirects; private MediaWikiParser parser; private Locale locale; public ArticleParser(String lang) { this.lang = lang; parser = parserFactory.getParser(lang); locale = new Locale(lang); redirects = locale.getRedirectIdentifiers(); } public ArticleParser() { parser = parserFactory.getParser(lang); locale = new Locale(lang); redirects = locale.getRedirectIdentifiers(); } public void parse(Article article, String mediawiki) { ParsedPage page = parser.parse(mediawiki); setRedirect(article, mediawiki); parse(article, page); } private void parse(Article article, ParsedPage page) { article.setLang(lang); setWikiTitle(article); if (page == null) { logger.warn("page is null for article {}", article.getTitle()); } else { setParagraphs(article, page); // setShortDescription(article); setTemplates(article, page); setLinks(article, page); setCategories(article, page); setHighlights(article, page); setSections(article, page); setTables(article, page); setEnWikiTitle(article, page); setLists(article, page); } setRedirect(article); setDisambiguation(article); setIsList(article); } // /** // * @param article // */ // private void setShortDescription(Article article) { // StringBuilder sb = new StringBuilder(); // for (String paragraph : article.getParagraphs()) { // paragraph = removeTemplates(paragraph); // sb.append(paragraph); // if (sb.length() > shortDescriptionLength) { // break; // } // } // if (sb.length() > shortDescriptionLength) { // sb.setLength(shortDescriptionLength); // int pos = sb.lastIndexOf(" "); // sb.setLength(pos); // } // article.setShortDescription(sb.toString()); // // } // private final static String templatePattern = "TEMPLATE\\[[^]]+\\]"; // // private static String removeTemplates(String paragraph) { // paragraph = paragraph.replaceAll(templatePattern, " "); // // return paragraph; // } /** * @param article */ private void setWikiTitle(Article article) { article.setWikiTitle(Article.getTitleInWikistyle(article.getTitle())); } /** * @param article */ private void setIsList(Article article) { for (String list : locale.getListIdentifiers()) { if (StringUtils.startsWithIgnoreCase(article.getTitle(), list)) { article.setType(Type.LIST); } } } private void setRedirect(Article article) { if (!article.getRedirect().isEmpty()) return; List<List<String>> lists = article.getLists(); if ((!lists.isEmpty()) && (!lists.get(0).isEmpty())) { // checking only first item in first list String line = lists.get(0).get(0); for (String redirect : redirects) { if (StringUtils.startsWithIgnoreCase(line, redirect)) { int pos = line.indexOf(' '); if (pos < 0) return; String red = line.substring(pos).trim(); red = Article.getTitleInWikistyle(red); article.setRedirect(red); article.setType(Type.REDIRECT); return; } } } } // for (List<String> lists : article.getLists()) { // for (String line : lists) { // for (String redirect : redirects) { // if (StringUtils.startsWithIgnoreCase(line, redirect)) { // int pos = line.indexOf(' '); // if (pos < 0) // return; // String red = line.substring(pos).trim(); // red = Article.getTitleInWikistyle(red); // article.setRedirect(red); // article.setType(Type.REDIRECT); // return; // // } // } // } // } /** * @param article * @param page */ private void setRedirect(Article article, String mediawiki) { for (String redirect : redirects) if (StringUtils.startsWithIgnoreCase(mediawiki, redirect)) { int start = mediawiki.indexOf("[[") + 2; int end = mediawiki.indexOf("]]"); if (start < 0 || end < 0) { logger.warn("cannot find the redirect {}\n mediawiki: {}", article.getTitle(), mediawiki); continue; } String r = Article.getTitleInWikistyle(mediawiki.substring(start, end)); article.setRedirect(r); article.setType(Type.REDIRECT); } } /** * @param page */ private void setTables(Article article, ParsedPage page) { List<Table> tables = new ArrayList<Table>(); for (de.tudarmstadt.ukp.wikipedia.parser.Table t : page.getTables()) { // System.out.println(t); int i = 0; String title = ""; if (t.getTitleElement() != null) { title = t.getTitleElement().getText(); if (title == null) title = ""; } Table table = new Table(title); List<String> currentRow = new ArrayList<String>(); List<Content> contentList = t.getContentList(); for (@SuppressWarnings("unused") Content c : contentList) { int row, col; String elem = ""; try { col = t.getTableElement(i).getCol(); row = t.getTableElement(i).getRow(); elem = t.getTableElement(i).getText(); } catch (IndexOutOfBoundsException e) { // logger.( // "Error creating table {}, Index out of bound - content = {}", // table.getName(), c.getText()); break; } if (row > 0 && col == 0) { if ((currentRow.size() == 1) && (currentRow.get(0).equals(table.getName()))) { currentRow = new ArrayList<String>(); } else { if (!currentRow.isEmpty()) table.addRow(currentRow); currentRow = new ArrayList<String>(); } } currentRow.add(elem); i++; } table.addRow(currentRow); tables.add(table); } article.setTables(tables); } protected void setEnWikiTitle(Article article, ParsedPage page) { if (article.isLang(Language.EN)) { return; } try { if (page.getLanguages() == null) { article.setEnWikiTitle(""); return; } } catch (NullPointerException e) { // FIXME title is always null! logger.warn("no languages for page {} ", article.getTitle()); return; } for (de.tudarmstadt.ukp.wikipedia.parser.Link l : page.getLanguages()) if (l.getText().startsWith("en:")) { article.setEnWikiTitle(l.getTarget().substring(3)); break; } } /** * @param page */ private void setSections(Article article, ParsedPage page) { List<String> sections = new ArrayList<String>(10); for (Section s : page.getSections()) { if (s == null || s.getTitle() == null) continue; sections.add(s.getTitle()); } article.setSections(sections); } private void setLinks(Article article, ParsedPage page) { List<Link> links = new ArrayList<Link>(10); List<Link> elinks = new ArrayList<Link>(10); for (de.tudarmstadt.ukp.wikipedia.parser.Link t : page.getLinks()) { if (t.getType() == de.tudarmstadt.ukp.wikipedia.parser.Link.type.INTERNAL) { links.add(new Link(t.getTarget(), t.getText())); } if (t.getType() == de.tudarmstadt.ukp.wikipedia.parser.Link.type.EXTERNAL) { elinks.add(new Link(t.getTarget(), t.getText())); } } article.setLinks(links); article.setExternalLinks(elinks); } private void setTemplates(Article article, ParsedPage page) { List<Template> templates = new ArrayList<Template>(10); for (de.tudarmstadt.ukp.wikipedia.parser.Template t : page.getTemplates()) { List<String> templateParameters = t.getParameters(); parseTemplatesSchema(article, templateParameters); if (t.getName().toLowerCase().startsWith("infobox")) { article.setInfobox(new Template(t.getName(), templateParameters)); } else { templates.add(new Template(t.getName(), templateParameters)); } } article.setTemplates(templates); } /** * * @param templateParameters */ private void parseTemplatesSchema(Article article, List<String> templateParameters) { List<String> schema = new ArrayList<String>(10); for (String s : templateParameters) { try { if (s.contains("=")) { String attributeName = s.split("=")[0].trim().toLowerCase(); schema.add(attributeName); } } catch (Exception e) { continue; } } article.addTemplatesSchema(schema); } private void setCategories(Article article, ParsedPage page) { ArrayList<Link> categories = new ArrayList<Link>(10); for (de.tudarmstadt.ukp.wikipedia.parser.Link c : page.getCategories()) { categories.add(new Link(c.getTarget(), c.getText())); } article.setCategories(categories); } private void setHighlights(Article article, ParsedPage page) { List<String> highlights = new ArrayList<String>(20); for (Paragraph p : page.getParagraphs()) { for (Span t : p.getFormatSpans(Content.FormatType.BOLD)) { highlights.add(t.getText(p.getText())); } for (Span t : p.getFormatSpans(Content.FormatType.ITALIC)) { highlights.add(t.getText(p.getText())); } } article.setHighlights(highlights); } private void setParagraphs(Article article, ParsedPage page) { List<String> paragraphs = new ArrayList<String>(page.nrOfParagraphs()); for (Paragraph p : page.getParagraphs()) { String text = p.getText(); // text = removeTemplates(text); text = text.replace("\n", " ").trim(); if (!text.isEmpty()) paragraphs.add(text); } article.setParagraphs(paragraphs); } private void setLists(Article article, ParsedPage page) { List<List<String>> lists = new LinkedList<List<String>>(); for (DefinitionList dl : page.getDefinitionLists()) { List<String> l = new ArrayList<String>(); for (ContentElement c : dl.getDefinitions()) { l.add(c.getText()); } lists.add(l); } for (NestedListContainer dl : page.getNestedLists()) { List<String> l = new ArrayList<String>(); for (NestedList nl : dl.getNestedLists()) l.add(nl.getText()); lists.add(l); } article.setLists(lists); } private void setDisambiguation(Article a) { for (String disambiguation : locale.getDisambigutionIdentifiers()) { if (StringUtils.containsIgnoreCase(a.getTitle(), disambiguation)) { a.setType(Type.DISAMBIGUATION); return; } for (Template t : a.getTemplates()) { if (StringUtils.equalsIgnoreCase(t.getName(), disambiguation)) { a.setType(Type.DISAMBIGUATION); return; } } } } }