de.juwimm.cms.search.res.HtmlDocumentLocator.java Source code

Java tutorial

Introduction

Here is the source code for de.juwimm.cms.search.res.HtmlDocumentLocator.java

Source

/**
 * Copyright (c) 2009 Juwi MacMillan Group GmbH
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.juwimm.cms.search.res;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringWriter;
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
import java.util.Properties;
import java.util.StringTokenizer;

import org.apache.log4j.Logger;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.springframework.beans.factory.annotation.Autowired;
import org.tizzit.util.XercesHelper;

import de.juwimm.cms.common.Constants;
import de.juwimm.cms.model.ContentHbm;
import de.juwimm.cms.model.ContentHbmDao;
import de.juwimm.cms.model.UnitHbm;
import de.juwimm.cms.model.UnitHbmDao;
import de.juwimm.cms.model.ViewComponentHbm;
import de.juwimm.cms.model.ViewDocumentHbm;
import de.juwimm.cms.safeguard.model.Realm2viewComponentHbm;
import de.juwimm.cms.safeguard.model.Realm2viewComponentHbmDao;
import de.juwimm.cms.safeguard.model.RealmJaasHbm;
import de.juwimm.cms.safeguard.model.RealmJdbcHbm;
import de.juwimm.cms.safeguard.model.RealmLdapHbm;
import de.juwimm.cms.safeguard.model.RealmSimplePwHbm;
import de.juwimm.cms.search.res.html.HTMLParser;

/**
 * Helper for getting injected the Compass-Instances from Spring and offering getters for them
 * @author <a href="mailto:carsten.schalm@juwimm.com">Carsten Schalm</a>
 * company Juwi|MacMillan Group Gmbh, Walsrode, Germany
 * @version $Id$
 */
public class HtmlDocumentLocator {
    private static Logger log = Logger.getLogger(HtmlDocumentLocator.class);
    @Autowired
    private UnitHbmDao unitHbmDao;
    @Autowired
    private ContentHbmDao contentHbmDao;
    @Autowired
    private Realm2viewComponentHbmDao realm2VcHbmDao;

    public Document getResource(File file, String url, Date modifiedDate, ViewComponentHbm vcl, ViewDocumentHbm vdl)
            throws IOException, InterruptedException {
        String cleanUrl = vdl.getLanguage() + "/" + vcl.getPath();
        Document resource = new Document();// resourceFactory.createResource("HtmlSearchValue");
        resource.add(new Field("alias", "HtmlSearchValue", Field.Store.YES, Field.Index.ANALYZED_NO_NORMS));
        resource.add(new Field("url", cleanUrl, Field.Store.YES, Field.Index.ANALYZED));
        resource.add(
                new Field("siteId", vdl.getSite().getSiteId().toString(), Field.Store.YES, Field.Index.ANALYZED));
        resource.add(new Field("language", vdl.getLanguage(), Field.Store.YES, Field.Index.ANALYZED));
        resource.add(new Field("viewtype", vdl.getViewType(), Field.Store.YES, Field.Index.ANALYZED));
        resource.add(
                new Field("modified", DateTools.timeToString(modifiedDate.getTime(), DateTools.Resolution.MINUTE),
                        Field.Store.YES, Field.Index.ANALYZED));
        resource.add(new Field("uid", url, Field.Store.YES, Field.Index.ANALYZED_NO_NORMS));
        resource.add(new Field("viewComponentId", vcl.getViewComponentId().toString(), Field.Store.YES,
                Field.Index.ANALYZED));
        try {
            Integer unitId = vcl.getViewComponentUnit().getUnit4ViewComponent();
            if (unitId != null) {
                UnitHbm unit = unitHbmDao.load(unitId);
                resource.add(
                        new Field("unitId", unit.getUnitId().toString(), Field.Store.YES, Field.Index.ANALYZED));
                resource.add(new Field("unitName", unit.getName(), Field.Store.YES, Field.Index.ANALYZED));
            }
        } catch (Exception exe) {
            if (log.isDebugEnabled())
                log.debug("unitId could not be loaded for vcId: " + vcl.getViewComponentId(), exe);
        }
        try {
            ContentHbm content = contentHbmDao.load(new Integer(vcl.getReference()));
            resource.add(new Field("template", content.getTemplate(), Field.Store.YES, Field.Index.ANALYZED));
        } catch (Exception exe) {
            if (log.isDebugEnabled())
                log.debug("template could not be loaded for vcId: " + vcl.getViewComponentId(), exe);
        }
        if (log.isDebugEnabled())
            log.debug("looking for realm now: ");
        try {
            String realm = getRealm4Vc(vcl);
            if (realm != null) {
                if (log.isDebugEnabled())
                    log.debug("realm is: " + realm);
                resource.add(new Field("realm", realm, Field.Store.YES, Field.Index.ANALYZED));
            } else {
                if (log.isDebugEnabled())
                    log.debug("page not protected - adding 'nullValue' for the search query.");
                resource.add(
                        new Field("realm", Constants.SEARCH_INDEX_NULL, Field.Store.YES, Field.Index.ANALYZED));
            }
        } catch (Exception exe) {
            if (log.isDebugEnabled())
                log.debug("realms could not be loaded for vcId: " + vcl.getViewComponentId(), exe);
        }
        InputStream inputStream = new FileInputStream(file);
        HTMLParser parser = new HTMLParser(inputStream);
        return parseHtml(resource, parser);
    }

    private String getRealm4Vc(ViewComponentHbm vc) {
        if (vc == null)
            return null;
        Realm2viewComponentHbm realm2vcHmb = realm2VcHbmDao.findByViewComponent(vc.getViewComponentId());
        if (realm2vcHmb == null) {
            return getRealm4Vc(vc.getParent());
        }
        RealmJdbcHbm jdbc = realm2vcHmb.getJdbcRealm();
        if (jdbc != null)
            return createRealmRoleCombo("JDBC_" + jdbc.getJdbcRealmId(), realm2vcHmb.getRoleNeeded());
        RealmSimplePwHbm simple = realm2vcHmb.getSimplePwRealm();
        if (simple != null)
            return createRealmRoleCombo("SIMPLEPW_" + simple.getSimplePwRealmId(), realm2vcHmb.getRoleNeeded());
        RealmLdapHbm ldap = realm2vcHmb.getLdapRealm();
        if (ldap != null)
            return createRealmRoleCombo("LDAP_" + ldap.getLdapRealmId(), realm2vcHmb.getRoleNeeded());
        RealmJaasHbm jaas = realm2vcHmb.getJaasRealm();
        if (jaas != null)
            return createRealmRoleCombo("JAAS_" + jaas.getJaasRealmId(), realm2vcHmb.getRoleNeeded());
        return null;
    }

    private String createRealmRoleCombo(String realm, String rolesNeeded) {
        //if (log.isDebugEnabled()) 
        log.info("createRealmRoleCombo realm: " + realm + " roles: " + rolesNeeded);
        if (rolesNeeded == null)
            return realm;
        String result = "";
        String[] roles = null;
        roles = rolesNeeded.split(Constants.SAFEGUARD_ROLE_SEPARATOR);
        for (int i = 0; i < roles.length; i++) {
            log.info("found: >" + roles[i] + "<");
            if (!roles[i].trim().isEmpty()) {
                result = result + " " + realm + "_" + roles[i].trim();
                log.info("result is now: >" + result + "<");
            }
        }
        return result;
    }

    public Document getExternalResource(String url, Reader htmlContent) throws IOException, InterruptedException {
        Document resource = new Document();
        resource.add(new Field("alias", "HtmlSearchValue", Field.Store.YES, Field.Index.ANALYZED_NO_NORMS));
        resource.add(new Field("url", url, Field.Store.YES, Field.Index.ANALYZED));
        resource.add(new Field("uid", url, Field.Store.YES, Field.Index.ANALYZED));
        HTMLParser parser = new HTMLParser(htmlContent);
        return parseHtml(resource, parser);
    }

    public String stripNonValidXMLCharacters(String in) {
        if (in == null)
            return "";
        String stripped = in.replaceAll("[^\\u0009\\u000a\\u000d\\u0020-\\ud7ff\\e0000-\\ufffd]", "")
                .replaceAll("[&<>]", "");
        return stripped;
    }

    private Document parseHtml(Document resource, HTMLParser parser) throws IOException, InterruptedException {
        Reader reader = parser.getReader();
        StringWriter sw = new StringWriter();
        org.apache.commons.io.IOUtils.copy(reader, sw);
        String sresult = sw.toString();

        if (log.isDebugEnabled())
            log.debug("Saving tokenized HTML value into searchengine: " + sresult);
        resource.add(
                new Field("contents", stripNonValidXMLCharacters(sresult), Field.Store.YES, Field.Index.ANALYZED));

        Properties prop = parser.getMetaTags();
        Collection metafields = prop.values();
        String metadata = "";
        Iterator it = metafields.iterator();
        while (it.hasNext()) {
            StringTokenizer st = new StringTokenizer((String) it.next(), ",");
            while (st.hasMoreElements()) {
                String token = st.nextToken().trim();
                metadata += token + " ";
            }
        }
        // tidy the metadata
        metadata = XercesHelper.html2utf8string(metadata);
        resource.add(new Field("metadata", metadata, Field.Store.YES, Field.Index.ANALYZED));

        // Add the summary as a field that is stored and returned with
        // hit documents for display.
        resource.add(new Field("summary", parser.getSummary(), Field.Store.YES, Field.Index.ANALYZED_NO_NORMS));

        // Add the title as a field that it can be searched and that is stored.
        resource.add(new Field("title", parser.getTitle(), Field.Store.YES, Field.Index.ANALYZED));
        reader.close();
        return resource;
    }
}