de.tudarmstadt.ukp.wikipedia.parser.mediawiki.FrenchTemplateParser.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.ukp.wikipedia.parser.mediawiki.FrenchTemplateParser.java

Source

/*******************************************************************************
 * Copyright (c) 2010 Torsten Zesch.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Lesser Public License v3
 * which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/lgpl.html
 * 
 * Contributors:
 *     Torsten Zesch - initial API and implementation
 ******************************************************************************/
package de.tudarmstadt.ukp.wikipedia.parser.mediawiki;

import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Scanner;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import de.tudarmstadt.ukp.wikipedia.parser.Link;
import de.tudarmstadt.ukp.wikipedia.parser.ParsedPage;
import de.tudarmstadt.ukp.wikipedia.parser.Template;
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiContentElementParser;

/**
 * This is the TemplateParser for the french language, with special treatment
 * for all the french templates, like "-s (sicle)" and others.
 * 
 * @author CJacobi
 * 
 */
public class FrenchTemplateParser implements MediaWikiTemplateParser {

    private final Log logger = LogFactory.getLog(getClass());

    private final String templatePrefix = "TEMPLATE[";
    private final String templatePostfix = "]";
    private final String parameterDivisor = ", ";
    private final String templateNotImplementedPrefix = "TEMPLATE NOT IMPLEMENTED[";
    private final String templateNotImplementedPostfix = "]";
    private final String emptyLinkText = "[ ]";

    // private MediaWikiContentElementParser parser;
    private List<String> deleteTemplates;
    private List<String> parseTemplates;
    private HashMap<String, String> parsing;

    public FrenchTemplateParser(MediaWikiContentElementParser parser, List<String> deleteTemplates,
            List<String> parseTemplates) {
        this.deleteTemplates = deleteTemplates;
        this.parseTemplates = parseTemplates;

        // this.parser = parser;

        /**
         * Adding code
         */
        String base = "./configuration/MediaWikiParsing/";
        try {
            parsing = getParseText(base + "parser_templates.txt", false, true);
        } catch (FileNotFoundException e) {
            System.err.println("Erreur lors de la lecture de parser_templates.fr");
            e.printStackTrace();
        }
    }

    public String configurationInfo() {
        StringBuilder result = new StringBuilder();
        result.append("Standard Template treatment: ShowNameAndParameters");
        result.append("\nDelete Templates: ");
        for (String s : deleteTemplates) {
            result.append("\"" + s + "\" ");
        }
        result.append("\nParse Templates: ");
        for (String s : parseTemplates) {
            result.append("\"" + s + "\" ");
        }
        return result.toString();
    }

    public ResolvedTemplate parseTemplate(Template t, ParsedPage pp) {

        final String templateName = t.getName();
        // Show Name and Parameters as Standart treatment.
        ResolvedTemplate result = new ResolvedTemplate(t);
        result.setPreParseReplacement(ResolvedTemplate.TEMPLATESPACER);
        StringBuilder sb = new StringBuilder();
        sb.append(templatePrefix);
        sb.append(t.getName() + parameterDivisor);
        for (String s : t.getParameters()) {
            sb.append(s + parameterDivisor);
        }
        sb.delete(sb.length() - parameterDivisor.length(), sb.length());
        sb.append(templatePostfix);
        result.setPostParseReplacement(sb.toString());

        result.setParsedObject(t);

        /**
         * 1 DeleteTempaltes : On enlve tout
         */
        for (String s : deleteTemplates) {
            s = s.toLowerCase();
            String templateName_lower = templateName.toLowerCase();

            if (templateName_lower.matches(s)) {
                result.setPostParseReplacement("");
                result.setParsedObject(null);
                return result;
            }
        }

        /**
         * 2. Parse Template
         */
        //      System.out.println(templateName);
        String templateName_lower = templateName.toLowerCase();

        for (String s : parseTemplates) {
            s = s.toLowerCase();
            if (templateName_lower.matches(s)) {

                List<String> templateParameters = t.getParameters();

                /**
                 * Template to Parse French
                 */
                if (parsing.containsKey(templateName)) {
                    String value = parsing.get(templateName);
                    //               System.out.println("TemplateName:"+templateName);
                    //               System.out.println("TemplateValue:"+value);

                    String[] args = value.split(" ");

                    String remplacement = "";

                    for (int i = 0; i < args.length; i++) {
                        String currArg = args[i];

                        if (currArg.equals("all")) {
                            // On prend ce qui est crit dans le texte
                            for (int j = 0; j < templateParameters.size(); j++) {
                                remplacement += templateParameters.get(j) + " ";
                            }
                            if (remplacement.equals("")) {
                                remplacement = templateName;
                            }
                            //                     System.out.println("Remplacement:"+remplacement);
                        }
                        // simple index (e.g: TEMPLATE 1 2 4  5)
                        else if (currArg.matches("[0-9]+")) {
                            int index_params = Integer.parseInt(currArg);

                            if (index_params < templateParameters.size()) {
                                remplacement += templateParameters.get(index_params) + " ";
                            }
                        } else {

                            // On prend ce qui est crit dans parser_templates.txt
                            remplacement += currArg + " ";
                        }

                    }

                    //               System.out.println(templateName_lower);
                    //               System.out.println("Remplacement:"+remplacement);
                    //               System.out.println();

                    result.setPostParseReplacement(remplacement);
                    result.setParsedObject(null);
                    return result;
                }
            }
        }

        /**
         * 3. Non delete and non pars
         */

        //      System.out.println("#########"+templateName);
        String remplacement = "";
        List<String> templateParameters = t.getParameters();
        for (int j = 0; j < templateParameters.size(); j++) {
            remplacement += templateParameters.get(j) + " ";
        }
        if (remplacement.equals("")) {
            remplacement = templateName;
        }
        result.setPostParseReplacement(remplacement);
        result.setParsedObject(null);
        return result;

        //      /**
        //       * 2 ParseTemplate
        //       */
        //
        //      for (String s : parseTemplates) {
        //
        //         s = s.toLowerCase();
        //         String templateName_lower = templateName.toLowerCase();
        //         
        //         List<String> templateParameters = t.getParameters();
        //
        //         if (s.equals(templateName_lower)) {
        //            logger.info("ParseTemplate: " + templateName);
        //
        //            /**
        //             * Template to Parse French
        //             */
        //            if(parsing.containsKey(templateName)){
        //               String value = parsing.get(templateName);
        ////               System.out.println("TemplateName:"+templateName);
        ////               System.out.println("TemplateValue:"+value);
        //               
        //               String [] args = value.split(" ");
        //               
        //               String remplacement = "";
        //               
        //               
        //               for(int i=0;i< args.length;i++){
        //                  String currArg = args[i];
        //                  
        //                  
        //                  if(currArg.equals("all")){
        //                     // On prend ce qui est crit dans le texte
        //                     for(int j=0;j<templateParameters.size();j++){
        //                        remplacement += templateParameters.get(j) + " ";
        //                     }
        //                     if(remplacement.equals("")){
        //                        remplacement = templateName;
        //                     }
        ////                     System.out.println("Remplacement:"+remplacement);
        //                  }
        //                  // simple index (e.g: TEMPLATE 1 2 4  5)
        //                  else if(currArg.matches("[0-9]+")){
        //                     int index_params = Integer.parseInt(currArg);
        //                     
        //                     if(index_params < templateParameters.size()){
        //                        remplacement += templateParameters.get(index_params) + " ";
        //                     }
        //                  }
        //                  else if(currArg.matches("[a-z]*")){
        //                     
        //                     // On prend ce qui est crit dans parser_templates.txt
        //                     remplacement += currArg + " ";
        //                  }
        //               
        //               }
        //               
        //               result.setPostParseReplacement(remplacement);
        //               result.setParsedObject(null);
        //               return result;
        //            }
        ////            if (templateName.equals("lang")) {
        ////
        ////               result.setPostParseReplacement(templateParameters.get(1));
        ////               result.setParsedObject(null);
        ////               return result;
        ////
        ////            } else if (templateName.equals("s-")) {
        ////               result.setPostParseReplacement(templateParameters.get(0)
        ////                     + " sicle");
        ////               result.setParsedObject(null);
        ////               return result;
        ////            } 
        //            
        ////            else if (templateName.equals("Xe sicle")) {
        ////               result.setPostParseReplacement("Xe sicle");
        ////               result.setParsedObject(null);
        ////               return result;
        ////            } else if (templateName.equals("XIe sicle")) {
        ////               result.setPostParseReplacement("XIe sicle");
        ////               result.setParsedObject(null);
        ////               return result;
        ////            } else if (templateName.equals("XIIe_sicle")) {
        ////               result.setPostParseReplacement("XIIe sicle");
        ////               result.setParsedObject(null);
        ////               return result;
        ////            } else if (templateName.equals("VXIIIe_sicle")) {
        ////               result.setPostParseReplacement("XIIIe sicle");
        ////               result.setParsedObject(null);
        ////               return result;
        ////            } else if (templateName.equals("XIVe_sicle")) {
        ////               result.setPostParseReplacement("XIVe sicle");
        ////               result.setParsedObject(null);
        ////               return result;
        ////            } else if (templateName.equals("XVe_sicle")) {
        ////               result.setPostParseReplacement("XVe sicle");
        ////               result.setParsedObject(null);
        ////               return result;
        ////            } else if (templateName.equals("XVIe_sicle")) {
        ////               result.setPostParseReplacement("XVI sicle");
        ////               result.setParsedObject(null);
        ////               return result;
        ////            } else if (templateName.equals("XVIIe_sicle")) {
        ////               result.setPostParseReplacement("XVIIe sicle");
        ////               result.setParsedObject(null);
        ////               return result;
        ////            } else if (templateName.equals("XVIIIe_sicle")) {
        ////               result.setPostParseReplacement("XVIIIe sicle");
        ////               result.setParsedObject(null);
        ////               return result;
        ////            } else if (templateName.equals("XIXe_sicle")) {
        ////               result.setPostParseReplacement("XIX sicle");
        ////               result.setParsedObject(null);
        ////               return result;
        ////            } else if (templateName.equals("XXe_sicle")) {
        ////               result.setPostParseReplacement("XX sicle");
        ////               result.setParsedObject(null);
        ////               return result;
        ////            }
        //         }
        //         
        ////         else{
        ////            System.out.println("NO MATCH:"+s);
        ////            System.out.println();
        ////            
        ////            String remplacement = "";
        ////            for(int j=0;j<templateParameters.size();j++){
        ////               remplacement += templateParameters.get(j) + " ";
        ////            }
        ////            if(remplacement.equals("")){
        ////               remplacement = templateName;
        ////            }
        ////            
        ////            result.setPostParseReplacement(remplacement);
        ////            result.setParsedObject(null);
        ////            return result;
        ////         }
        //      }
    }

    public HashMap<String, String> getParseText(String path, boolean print, boolean normalize)
            throws FileNotFoundException {

        HashMap<String, String> parsing = new HashMap<String, String>();

        String filePath = path;
        Scanner scanner = new Scanner(new File(filePath));

        // On boucle sur chaque champ detect
        while (scanner.hasNextLine()) {
            String line = scanner.nextLine();
            line = line.trim();

            if (!line.startsWith("#") && line.length() > 0) {

                String name = line.replaceAll("\\[(.*)\\]", "");
                String value = line.substring(name.length());
                value = value.replaceAll("\\[", "");
                value = value.replaceAll("\\]", "");

                if (print) {
                    System.out.println(name + "\t" + value);
                }
                parsing.put(name, value);

                if (normalize) {
                    if (print) {
                        System.out.println(name.toUpperCase() + "\t" + value);
                        System.out.println(name.toLowerCase() + "\t" + value);
                    }
                    parsing.put(name.toUpperCase(), value);
                    parsing.put(name.toLowerCase(), value);
                }
            }
            // faites ici votre traitement
        }

        scanner.close();

        return parsing;
    }
}