lius.index.xml.XmlFileIndexer.java Source code

Java tutorial

Introduction

Here is the source code for lius.index.xml.XmlFileIndexer.java

Source

package lius.index.xml;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import lius.config.LiusField;
import lius.index.Indexer;
import lius.util.LiusUtils;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.log4j.Logger;
import org.jaxen.JaxenException;
import org.jaxen.SimpleNamespaceContext;
import org.jaxen.jdom.JDOMXPath;
import org.jdom.Attribute;
import org.jdom.CDATA;
import org.jdom.Comment;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.EntityRef;
import org.jdom.Namespace;
import org.jdom.ProcessingInstruction;
import org.jdom.Text;

/**
 * Indexer for XML files, based on JDOM and XPath (Jaxen).
 * <p>
 * Each configured {@link LiusField} carries an XPath expression; the text of
 * the nodes it selects becomes the value of the field. Namespaces declared in
 * the document are collected while the document is walked and are made
 * available to every XPath expression evaluated by this instance.
 *
 * @author Rida Benjelloun (ridabenjelloun@gmail.com)
 */

public class XmlFileIndexer extends Indexer {

    /**
     * Prefix/URI bindings collected by {@link #getAllDocumentNs(Document)};
     * handed to every XPath expression evaluated by this instance.
     */
    private SimpleNamespaceContext nsc = new SimpleNamespaceContext();

    static Logger logger = Logger.getRootLogger();

    /**
     * @return the numeric type code of this indexer (always {@code 1})
     */
    public int getType() {
        return 1;
    }

    /**
     * @return {@code true} when the configuration declares XML file fields
     */
    public boolean isConfigured() {
        return getLiusConfig().getXmlFileFields() != null;
    }

    /**
     * @return the XML file fields declared in the configuration
     */
    public Collection getConfigurationFields() {
        return getLiusConfig().getXmlFileFields();
    }

    /**
     * Parses the stream to index and returns the trimmed text of every element
     * of the document (XPath {@code //*}), concatenated.
     *
     * @return the concatenated element text of the document
     */
    public String getContent() {
        return concatOccurance(LiusUtils.parse(getStreamToIndex()), "//*", "");
    }

    /**
     * Concatenates the text of all nodes selected by an XPath expression so
     * that multiple occurrences can be stored in a single Lucene field.
     * <p>
     * The text of each node is trimmed; nodes whose text is empty are skipped.
     * The remaining values are joined with {@code " " + concatSep + " "}. The
     * separator is only ever written between two values, so the result never
     * starts or ends with it.
     *
     * @param xmlDoc    the context object the XPath is evaluated against
     * @param xpath     the XPath expression selecting the nodes
     * @param concatSep the separator inserted between two values
     * @return the joined text, or an empty string when nothing was selected
     *         or the XPath expression could not be evaluated (the error is
     *         logged)
     */
    public String concatOccurance(Object xmlDoc, String xpath, String concatSep) {
        StringBuffer chaineConcat = new StringBuffer();
        try {
            JDOMXPath xp = new JDOMXPath(xpath);
            xp.setNamespaceContext(nsc);
            Iterator nodes = xp.selectNodes(xmlDoc).iterator();
            while (nodes.hasNext()) {
                String raw = nodeText(nodes.next());
                String text = (raw == null) ? "" : raw.trim();
                // Compare by content: reference comparison against "" is not
                // reliable for strings produced by trim().
                if (text.length() == 0) {
                    continue;
                }
                // Separator goes *before* every value but the first, so empty
                // trailing nodes cannot leave a dangling separator behind.
                if (chaineConcat.length() > 0) {
                    chaineConcat.append(" ").append(concatSep).append(" ");
                }
                chaineConcat.append(text);
            }
        } catch (JaxenException e) {
            logger.error(e.getMessage(), e);
        }
        return chaineConcat.toString().trim();
    }

    /**
     * Parses the stream to index and returns the configured fields populated
     * with the values found in the document (full text, title, etc.).
     *
     * @return the populated fields
     */
    public Collection getPopulatedLiusFields() {
        Document xmlDoc = LiusUtils.parse(getStreamToIndex());
        return getPopulatedLiusFields(xmlDoc, getLiusConfig().getXmlFileFields());
    }

    /**
     * Populates the given fields with values extracted from {@code xml}.
     * <p>
     * {@code liusXmlFields} may contain {@link Map} entries keyed by namespace
     * URI whose values are collections of fields. For each such map, the
     * collections registered for the namespaces found in the document are
     * used; when no namespace has matched, the collection stored under the
     * key {@code "default"} is used instead. When the collection contains no
     * map at all, its elements are treated directly as the fields to populate.
     *
     * @param xml           the XPath context; namespaces are only collected
     *                      when it is an {@code org.jdom.Document}
     * @param liusXmlFields the configured fields (may be {@code null})
     * @return a new collection holding the populated fields; empty when
     *         {@code liusXmlFields} is {@code null}
     */
    public Collection getPopulatedLiusFields(Object xml, Collection liusXmlFields) {
        Collection resColl = new ArrayList();
        if (liusXmlFields == null) {
            return resColl;
        }

        // Stays empty for non-Document input so that namespace-keyed maps
        // fall back to their "default" entry instead of failing.
        List documentNs = new ArrayList();
        if (xml instanceof org.jdom.Document) {
            documentNs = getAllDocumentNs((org.jdom.Document) xml);
        }

        boolean nsTrouve = false;
        boolean isMap = false;
        Iterator itColl = liusXmlFields.iterator();
        while (itColl.hasNext()) {
            Object colElem = itColl.next();
            if (!(colElem instanceof Map)) {
                continue;
            }
            isMap = true;
            Map hm = (Map) colElem;
            for (int j = 0; j < documentNs.size(); j++) {
                Collection liusFields = (Collection) hm.get(documentNs.get(j));
                if (liusFields != null) {
                    nsTrouve = true;
                    extractDataFromElements(xml, liusFields, resColl);
                }
            }
            // The flag is deliberately not reset per map: once any namespace
            // has matched, later maps do not fall back to "default" either.
            if (!nsTrouve) {
                extractDataFromElements(xml, (Collection) hm.get("default"), resColl);
            }
        }
        if (!isMap) {
            extractDataFromElements(xml, liusXmlFields, resColl);
        }
        return resColl;
    }

    /**
     * Evaluates every {@link LiusField} of {@code liusFields} against
     * {@code xmlDoc} and adds the populated copies to {@code resColl}.
     * Elements that are not {@link LiusField} instances are added unchanged.
     * <p>
     * A field with an occurrence separator yields exactly one result whose
     * value is the concatenation of all occurrences; a field without one
     * yields one result per selected node. The configured field itself is
     * never modified; results are always copies.
     *
     * @param xmlDoc     the XPath context
     * @param liusFields the fields to evaluate; {@code null} is ignored
     * @param resColl    the collection receiving the results
     */
    private void extractDataFromElements(Object xmlDoc, Collection liusFields, Collection resColl) {
        if (liusFields == null) {
            return;
        }
        Iterator it = liusFields.iterator();
        while (it.hasNext()) {
            Object field = it.next();
            if (!(field instanceof LiusField)) {
                resColl.add(field);
                continue;
            }
            LiusField lf = (LiusField) field;
            try {
                if (lf.getOcurSep() != null) {
                    // Work on a copy so the shared configuration object does
                    // not carry the value of the last indexed document.
                    LiusField merged = copyOf(lf);
                    merged.setValue(concatOccurance(xmlDoc, lf.getXpathSelect(), lf.getOcurSep()));
                    resColl.add(merged);
                } else {
                    addOccurrences(xmlDoc, lf, resColl);
                }
            } catch (JaxenException e) {
                logger.error(e.getMessage(), e);
            } catch (InvocationTargetException ex) {
                logger.error(ex.getMessage(), ex);
            } catch (IllegalAccessException ex) {
                logger.error(ex.getMessage(), ex);
            }
        }
    }

    /**
     * Adds one populated copy of {@code lf} to {@code resColl} for every node
     * selected by the field's XPath expression. Elements whose trimmed text is
     * empty and nodes of an unsupported type are skipped.
     */
    private void addOccurrences(Object xmlDoc, LiusField lf, Collection resColl)
            throws JaxenException, InvocationTargetException, IllegalAccessException {
        JDOMXPath xp = new JDOMXPath(lf.getXpathSelect());
        xp.setNamespaceContext(nsc);
        Iterator nodes = xp.selectNodes(xmlDoc).iterator();
        while (nodes.hasNext()) {
            Object selection = nodes.next();
            String value = nodeText(selection);
            if (value == null) {
                continue;
            }
            // Only elements are filtered on emptiness; the stored value is
            // the untrimmed text.
            if (selection instanceof Element && value.trim().length() == 0) {
                continue;
            }
            LiusField lfoccur = copyOf(lf);
            lfoccur.setValue(value);
            resColl.add(lfoccur);
        }
    }

    /**
     * Returns a new {@link LiusField} with the bean properties of
     * {@code template} copied into it.
     */
    private LiusField copyOf(LiusField template) throws InvocationTargetException, IllegalAccessException {
        LiusField copy = new LiusField();
        BeanUtils.copyProperties(copy, template);
        return copy;
    }

    /**
     * Returns the untrimmed textual value of a node selected by XPath, or
     * {@code null} when the object is not one of the supported JDOM node
     * types (element, attribute, text/CDATA, comment, processing instruction,
     * entity reference).
     */
    private String nodeText(Object node) {
        if (node instanceof Element) {
            return ((Element) node).getText();
        }
        if (node instanceof Attribute) {
            return ((Attribute) node).getValue();
        }
        if (node instanceof Text) {
            return ((Text) node).getText();
        }
        if (node instanceof CDATA) {
            return ((CDATA) node).getText();
        }
        if (node instanceof Comment) {
            return ((Comment) node).getText();
        }
        if (node instanceof ProcessingInstruction) {
            return ((ProcessingInstruction) node).getData();
        }
        if (node instanceof EntityRef) {
            return ((EntityRef) node).toString();
        }
        return null;
    }

    /**
     * Walks the whole document, registers every namespace found with the
     * XPath namespace context of this indexer and returns the distinct
     * namespace URIs.
     *
     * @param doc the document to inspect
     * @return the distinct (trimmed) namespace URIs, in order of discovery
     */
    public List getAllDocumentNs(org.jdom.Document doc) {
        List ls = new ArrayList();
        processChildren(doc.getRootElement(), ls);
        return ls;
    }

    /**
     * @return {@code true} when {@code nsUri} is already contained in
     *         {@code nsLs}
     */
    private boolean exist(List nsLs, String nsUri) {
        return nsLs.contains(nsUri);
    }

    /**
     * Records the namespace of {@code elem} and its additional namespace
     * declarations, then recurses into the child elements.
     */
    private void processChildren(Element elem, List ns) {
        Namespace nsCourent = elem.getNamespace();
        // Compare the trimmed URI, since that is the form stored in the list.
        String nsUri = nsCourent.getURI().trim();
        if (!exist(ns, nsUri)) {
            ns.add(nsUri);
            nsc.addNamespace(nsCourent.getPrefix(), nsCourent.getURI());
        }
        List additionalNs = elem.getAdditionalNamespaces();
        if (!additionalNs.isEmpty()) {
            copyNsList(additionalNs, ns);
        }
        List elemChildren = elem.getChildren();
        for (int i = 0; i < elemChildren.size(); i++) {
            processChildren((Element) elemChildren.get(i), ns);
        }
    }

    /**
     * Registers every namespace of {@code nsElem} with the XPath namespace
     * context and adds its URI to {@code nsRes} unless it is already there,
     * so that the fields of a namespace are not extracted more than once.
     */
    private void copyNsList(List nsElem, List nsRes) {
        for (int i = 0; i < nsElem.size(); i++) {
            Namespace ns = (Namespace) nsElem.get(i);
            nsc.addNamespace(ns.getPrefix(), ns.getURI());
            String uri = ns.getURI().trim();
            if (!exist(nsRes, uri)) {
                nsRes.add(uri);
            }
        }
    }

}