com.flaptor.hounder.indexer.SanitizerModule.java Source code

Introduction

Here is the source code for com.flaptor.hounder.indexer.SanitizerModule.java
Source

/*
Copyright 2008 Flaptor (flaptor.com) 
    
Licensed under the Apache License, Version 2.0 (the "License"); 
you may not use this file except in compliance with the License. 
You may obtain a copy of the License at 
    
http://www.apache.org/licenses/LICENSE-2.0 
    
Unless required by applicable law or agreed to in writing, software 
distributed under the License is distributed on an "AS IS" BASIS, 
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and 
limitations under the License.
*/
package com.flaptor.hounder.indexer;

import com.flaptor.util.Config;
import com.flaptor.util.DomUtil;
import com.flaptor.util.Execute;

import org.apache.log4j.Logger;

import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;

/**
 * This class implements a module that captures commands and executes them.
 * The commands have the following format:
 * <pre>
 *   &lt; command name=(string) / &gt;
 * </pre>
 * where the command name may be "optimize" to schedule an index optimization;
 * , "close" to close the app cleanly or "checkpoint" to flush the index to disk.
 * 
 * @author Flaptor Development Team
 */
public class SanitizerModule extends AModule {
    private static final Logger logger = Logger.getLogger(Execute.whoAmI());
    private static com.flaptor.util.parser.HtmlParser htmlParser = new com.flaptor.util.parser.HtmlParser();
    private HashSet<String> htmlFields, xmlFields, accentFields, allFields;
    private String xpath;

    /**
     * Constructor.
     * @param indexer a reference to the indexer that contains this module.
     */
    public SanitizerModule() {
        Config config = Config.getConfig("indexer.properties");
        xpath = config.getString("SanitizerModule.XPath");
        htmlFields = new HashSet<String>(Arrays.asList(config.getStringArray("SanitizerModule.html")));
        xmlFields = new HashSet<String>(Arrays.asList(config.getStringArray("SanitizerModule.xml")));
        accentFields = new HashSet<String>(Arrays.asList(config.getStringArray("SanitizerModule.accents")));
        allFields = new HashSet<String>();
        allFields.addAll(htmlFields);
        allFields.addAll(xmlFields);
        allFields.addAll(accentFields);
    }

    /**
     * @param doc a non null document to process
     */
    public Document[] internalProcess(final Document doc) {
        Element root = doc.getRootElement();
        if (null != root) {
            for (String name : allFields) {
                Element elem = (Element) root.selectSingleNode(xpath.replace("$", name));
                if (null != elem) {
                    try {
                        String text = elem.getText();

                        if (htmlFields.contains(name)) {
                            text = htmlParser.parse("internal document", text.getBytes("UTF-8"), "UTF-8").getText();
                        }

                        if (xmlFields.contains(name)) {
                            text = DomUtil.filterXml(text);
                        }

                        if (accentFields.contains(name)) {
                            text = filterAccents(text);
                        }

                        elem.setText(text);
                    } catch (Exception e) {
                        logger.warn("Sanitizing field " + name, e);
                    }
                }
            }
        }
        Document[] docs = { doc };
        return docs;
    }

private String filterAccents(String text) {
    StringBuffer buf = new StringBuffer();
    text = text.toLowerCase();
    for (int i=0; i<text.length(); i++) {
        char c = text.charAt(i);
        switch (c) {
            case '':
            case '':
            case '':
            case '':
                c = 'a';
                break;
            case '':
            case '':
            case '':
            case '':
                c = 'e';
                break;
            case '':
            case '':
            case '':
            case '':
                c = 'i';
                break;
            case '':
            case '':
            case '':
            case '':
                c = 'o';
                break;
            case '':
            case '':
            case '':
            case '':
                c = 'u';
                break;
            case '':
                c = 'n';
                break;
            case '':
                c = 'c';
                break;
        }
        buf.append(c);
    }
    return buf.toString();
}

    public static void main(String[] args) {
        String text = args[0];
        Document doc = DocumentHelper.createDocument();
        Element root = doc.addElement("documentAdd");
        root.addElement("text").addText(text);
        root.addElement("field").addAttribute("name", "text").addAttribute("indexed", "true")
                .addAttribute("stored", "true").addAttribute("tokenized", "true").addText(text);
        SanitizerModule mod = new SanitizerModule();
        Document[] docs = mod.internalProcess(doc);
        for (Document d : docs) {
            System.out.println(DomUtil.domToString(d));
        }
    }
}