de.ingrid.iplug.sns.SNSIndexingInterface.java Source code

Introduction

Here is the source code for de.ingrid.iplug.sns.SNSIndexingInterface.java
Source

/*
 * **************************************************-
 * Ingrid iPlug SNS
 * ==================================================
 * Copyright (C) 2014 - 2015 wemove digital solutions GmbH
 * ==================================================
 * Licensed under the EUPL, Version 1.1 or  as soon they will be
 * approved by the European Commission - subsequent versions of the
 * EUPL (the "Licence");
 * 
 * You may not use this work except in compliance with the Licence.
 * You may obtain a copy of the Licence at:
 * 
 * http://ec.europa.eu/idabc/eupl5
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the Licence is distributed on an "AS IS" basis,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the Licence for the specific language governing permissions and
 * limitations under the Licence.
 * **************************************************#
 */
/*
 * Copyright (c) 1997-2006 by media style GmbH
 */

package de.ingrid.iplug.sns;

import java.net.MalformedURLException;
import java.net.URL;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import de.ingrid.external.FullClassifyService;
import de.ingrid.external.om.Event;
import de.ingrid.external.om.FullClassifyResult;
import de.ingrid.external.om.Location;
import de.ingrid.external.om.Term;
import de.ingrid.utils.tool.SNSUtil;
import de.ingrid.utils.tool.SpringUtil;

/**
 * An interface to the SN Service. It is mainly used to get some data for indexing.
 * <p>
 * Usage pattern: call one of the <code>getBuzzwords...</code> methods first. The classification
 * result of that call is kept in instance fields and is then the source for
 * {@link #getTopicIds()}, {@link #getReferencesToTime()}, {@link #getReferencesToSpace()} and
 * {@link #getLocations()}. Every new <code>getBuzzwords...</code> call replaces the results of
 * the previous one.
 */
public class SNSIndexingInterface {

    private static final Log log = LogFactory.getLog(SNSIndexingInterface.class);

    // Derived reference data. Set to null by initializeFromFullClassifyResult() to mark
    // "not computed yet"; (re)filled by getReferences().
    private List<Temporal> fTemporal = new ArrayList<Temporal>();

    private List<Wgs84Box> fWgs84Box = new ArrayList<Wgs84Box>();

    private List<String> fLocationNames = new ArrayList<String>();

    /** Default language, used whenever a caller passes no language. */
    private String fLanguage;

    private List<String> fTopicIds = new ArrayList<String>();

    private String fGemeindeKennzifferPrefix = "ags:";

    // Type token handed to SpringUtil.getBean(). Its value is null, so it only carries the
    // generic type for that call.
    // NOTE(review): confirm SpringUtil.getBean tolerates a null class argument.
    private final Class<FullClassifyService> _fullClassifyService = null;
    private FullClassifyService fFullClassifyService;

    // Filtered raw results of the last classification call.
    private List<Term> fTerms = new ArrayList<Term>();
    private List<Location> fLocations = new ArrayList<Location>();
    private List<Event> fEvents = new ArrayList<Event>();

    /**
     * Interface for indexing service connection handling.
     * NOTICE: Now handled via abstract FullClassifyService API, so login parameters ARE IGNORED !!!!
     * (e.g. instead read from sns.properties). But you can set a default language !
     * @param login  IGNORED !!!! Username for the SN service.
     * @param password  IGNORED !!!! Password for the SN service.
     * @param language The DEFAULT language the result should be e.g. "de" (used if no language is passed).
     * @throws Exception If we cannot connect to the sns server.
     */
    public SNSIndexingInterface(final String login, final String password, final String language) throws Exception {
        this.fLanguage = language;

        SpringUtil springUtil = new SpringUtil("spring/external-services.xml");
        this.fFullClassifyService = springUtil.getBean("fullClassifyService", _fullClassifyService);
    }

    /**
     * IGNORED ! Timeout of IndexingInterface now handled in abstract API ! (e.g. read from sns.properties)
     * @param timeout The timeout in milliseconds. IGNORED !!!!
     */
    public void setTimeout(final int timeout) {
        // intentionally a no-op, kept so existing callers still compile
    }

    /**
     * Sets the prefix that is passed to <code>SNSUtil.transformSpacialReference</code> together
     * with the native key of a location when the community code of a bounding box is built.
     *
     * @param prefix The prefix to use (default is "ags:").
     */
    public void setGemeindeKennzifferPrefix(String prefix) {
        this.fGemeindeKennzifferPrefix = prefix;
    }

    /**
     * All buzzwords to the given document, using the default language. You must call this method
     * first to get results from <code>getReferencesToTime</code> and
     * <code>getReferencesToSpace</code>.
     *
     * @param text
     *            The document to analyze.
     * @param maxToAnalyzeWords
     *            The first <code>maxToAnalyzeWords</code> words of the document that should be
     *            analyzed.
     * @param ignoreCase
     *            Set to true ignore case of the document.
     * @return A string array filled with all buzzwords.
     * @throws Exception
     *             If we cannot connect to the sns server.
     */
    public String[] getBuzzwords(final String text, final int maxToAnalyzeWords, boolean ignoreCase)
            throws Exception {
        return getBuzzwords(text, maxToAnalyzeWords, ignoreCase, this.fLanguage);
    }

    /**
     * All buzzwords to the given document. You must call this method first to get results from
     * <code>getReferencesToTime</code> and <code>getReferencesToSpace</code>.
     *
     * @param text The document to analyze.
     * @param maxToAnalyzeWords The first <code>maxToAnalyzeWords</code> words of the document that should be analyzed.
     * @param ignoreCase Set to true ignore case of the document.
     * @param language The language the text is encoded. If null, the default language is used.
     * @return A string array filled with all buzzwords.
     * @throws Exception If we cannot connect to the sns server.
     */
    public String[] getBuzzwords(final String text, final int maxToAnalyzeWords, boolean ignoreCase,
            String language) throws Exception {
        if (language == null) {
            language = this.fLanguage;
        }

        if (log.isDebugEnabled()) {
            log.debug("     !!!!!!!!!! calling API fullClassifyService.autoClassifyText filter=null " + language);
        }

        FullClassifyResult classifyResult = fFullClassifyService.autoClassifyText(text, maxToAnalyzeWords,
                ignoreCase, null, new Locale(language));

        initializeFromFullClassifyResult(classifyResult);
        return getBasenames();
    }

    /**
     * All buzzwords to the given URL. You must call this method first to get results from
     * <code>getReferencesToTime</code> and <code>getReferencesToSpace</code>.
     *
     * @param urlStr The url to analyze.
     * @param maxToAnalyzeWords The first <code>maxToAnalyzeWords</code> words of the document that should be analyzed.
     * @param ignoreCase Currently NOT passed on to the service (<code>false</code> is always used).
     * @param language The language the text is encoded. If null, the default language is used.
     * @return A string array filled with all buzzwords.
     * @throws MalformedURLException If <code>urlStr</code> is not a valid URL.
     * @throws Exception If we cannot connect to the sns server.
     */
    public String[] getBuzzwordsToUrl(final String urlStr, final int maxToAnalyzeWords, boolean ignoreCase,
            String language) throws Exception {
        // Same fallback as getBuzzwords(): without it a null language fails in new Locale(null).
        if (language == null) {
            language = this.fLanguage;
        }

        URL url = createURL(urlStr);

        if (log.isDebugEnabled()) {
            log.debug("     !!!!!!!!!! calling API fullClassifyService.autoClassifyURL " + url + ", filter=null, "
                    + language);
        }

        FullClassifyResult classifyResult;
        try {
            // NOTE(review): ignoreCase is not forwarded here, "false" is hard-coded. Confirm
            // whether this is intended before changing it.
            classifyResult = fFullClassifyService.autoClassifyURL(url, maxToAnalyzeWords, false, null,
                    new Locale(language));
        } catch (Exception ex) {
            log.warn("Problems autoClassifyURL " + urlStr, ex);
            throw ex;
        }

        initializeFromFullClassifyResult(classifyResult);
        return getBasenames();
    }

    /**
     * Set up initial structures from classification result. Null containers and null or
     * nameless terms/locations are skipped, so the rest of the class can iterate safely.
     *
     * @param classifyResult The result delivered by the classify service, may be null
     *            (then treated like an empty result).
     */
    private void initializeFromFullClassifyResult(FullClassifyResult classifyResult) {
        // we already filter stuff to be sure we get valid results ! We never know (SNS or GS Soil Services or ...)
        this.fTerms = new ArrayList<Term>();
        this.fLocations = new ArrayList<Location>();
        this.fEvents = new ArrayList<Event>();

        if (classifyResult != null) {
            if (classifyResult.getTerms() != null) {
                for (Term term : classifyResult.getTerms()) {
                    if (term != null && hasText(term.getName())) {
                        this.fTerms.add(term);
                    }
                }
            }

            // GSSoil Gazetteer delivers null locations and names !? So we filter empty elements !
            if (classifyResult.getLocations() != null) {
                for (Location location : classifyResult.getLocations()) {
                    if (location != null && hasText(location.getName())) {
                        this.fLocations.add(location);
                    }
                }
            }

            // events without a title are kept (may be pure time references), only null entries are dropped
            List<Event> events = classifyResult.getEvents();
            if (events != null) {
                for (Event event : events) {
                    if (event != null) {
                        this.fEvents.add(event);
                    }
                }
            }
        }

        // we set these ones to NULL ! So we know not processed yet (set to empty list when processed !)
        this.fTopicIds = null;
        this.fTemporal = null;
        this.fWgs84Box = null;
        this.fLocationNames = null;
    }

    /**
     * Return the BUZZWORDS ! NOTICE: terms, locations AND events !
     *
     * @return The names of all terms and locations and the titles of all events of the last
     *         classification, in that order and without duplicates.
     */
    private String[] getBasenames() {
        // we use set (ordered), so duplicates are removed !!!
        Set<String> result = new LinkedHashSet<String>();

        for (Term term : fTerms) {
            result.add(term.getName());
        }

        for (Location location : fLocations) {
            result.add(location.getName());
        }

        for (Event event : fEvents) {
            // title may not be set if NOT using SNS and event is just a time reference !
            if (hasText(event.getTitle())) {
                result.add(event.getTitle());
            }
        }

        if (log.isDebugEnabled()) {
            log.debug(
                    "     FULL RESULTS OF CLASSIFICATION (limited number may be written into index fields, see max* in sns.properties)");
            log.debug(joinForLog(
                    "     buzzwords (Terms, Locations, Events in that order -> \"buzzword\" index field): ",
                    result));
        }

        return result.toArray(new String[result.size()]);
    }

    /**
     * Computes topic ids, location names, bounding boxes and time references from the terms,
     * locations and events of the last classification and stores them in the instance fields.
     *
     * @throws Exception If the transformation of a location's native key fails.
     */
    private void getReferences() throws Exception {
        // initialize Lists here ! So we can differ whether getReferences() already called !
        fTopicIds = new ArrayList<String>();
        fTemporal = new ArrayList<Temporal>();
        fWgs84Box = new ArrayList<Wgs84Box>();
        fLocationNames = new ArrayList<String>();

        // NOTICE: the fTopicIds go into the "areaid" index field in the SE !
        // so we first process locations and then the terms, see below.
        for (Location location : fLocations) {
            this.fTopicIds.add(location.getId());
            this.fLocationNames.add(location.getName());

            boolean wgs84BoxSet = false;
            final Wgs84Box wgs84Box = new Wgs84Box(location.getName(), 0, 0, 0, 0, "");
            final float[] bbox = location.getBoundingBox();
            // an incomplete bounding box is treated like a missing one
            if (bbox != null && bbox.length >= 4) {
                wgs84Box.setX1(toDouble(bbox[0]));
                wgs84Box.setY1(toDouble(bbox[1]));
                wgs84Box.setX2(toDouble(bbox[2]));
                wgs84Box.setY2(toDouble(bbox[3]));
                wgs84BoxSet = true;
            }
            if (location.getNativeKey() != null) {
                String gemeindekennziffer = SNSUtil.transformSpacialReference(this.fGemeindeKennzifferPrefix,
                        location.getNativeKey());
                if (gemeindekennziffer.startsWith("lawa:")) {
                    gemeindekennziffer = SNSUtil.transformSpacialReference("lawa:", location.getNativeKey());
                }
                wgs84Box.setGemeindekennziffer(gemeindekennziffer);
                wgs84BoxSet = true;
            }

            // only boxes carrying coordinates and/or a community code are of interest
            if (wgs84BoxSet) {
                this.fWgs84Box.add(wgs84Box);
            }
        }

        // NOTICE: the fTopicIds go into the "areaid" index field in the SE !
        // So why add the ids of the terms ? We move this one behind the locations
        // to guarantee locations first in index field (number of entries in index
        // field may be limited)
        for (Term term : fTerms) {
            this.fTopicIds.add(term.getId());
        }

        for (Event event : fEvents) {
            // may be not set if NOT using SNS and event is just a time reference !
            if (event.getId() != null && event.getId().length() > 0) {
                this.fTopicIds.add(event.getId());
            }

            final Temporal temporal = new Temporal();
            temporal.setAt(event.getTimeAt());
            temporal.setFrom(event.getTimeRangeFrom());
            temporal.setTo(event.getTimeRangeTo());
            if (!temporal.isEmpty()) {
                this.fTemporal.add(temporal);
            }
        }
    }

    /**
     * All topic ids to the result.
     *
     * @return The ids of all locations, terms and events of the last classification, in that
     *         order and without duplicates. Empty if nothing was found.
     * @throws Exception If the references cannot be computed.
     * @throws ParseException Declared for backward compatibility.
     */
    public String[] getTopicIds() throws Exception, ParseException {
        if (this.fTopicIds == null) {
            getReferences();
        }

        // we use set to remove duplicates (insertion order is kept)
        Set<String> ret = new LinkedHashSet<String>(fTopicIds);

        if (log.isDebugEnabled()) {
            log.debug(joinForLog(
                    "     topicIds (Locations, Terms, Events in that order -> \"areaid\" index field): ", ret));
        }

        return ret.toArray(new String[ret.size()]);
    }

    /**
     * All time references to a document that is analyzed by <code>getBuzzwords</code>.
     *
     * @return Array of time references. It is empty if nothing is found.
     * @throws Exception
     *             If we cannot connect to the sns server.
     * @throws ParseException
     *             If the date cannot be parsed.
     */
    public Temporal[] getReferencesToTime() throws Exception, ParseException {
        if (this.fTemporal == null) {
            getReferences();
        }

        if (log.isDebugEnabled()) {
            StringBuilder output = new StringBuilder(
                    "     time (-> \"t0\", \"t1\", \"t2\" index fields, date format 'yyyyMMdd'): ");
            for (Temporal temporal : this.fTemporal) {
                output.append("at:").append(temporal.getAt());
                output.append(" from:").append(temporal.getFrom());
                output.append(" to:").append(temporal.getTo()).append(", ");
            }
            log.debug(output.toString());
        }

        return this.fTemporal.toArray(new Temporal[this.fTemporal.size()]);
    }

    /**
     * All coordinate references to a document that is analyzed by <code>getBuzzwords</code>.
     *
     * @return Array of bounding boxes. It is empty if nothing is found.
     * @throws Exception
     *             If we cannot connect to the sns server.
     */
    public Wgs84Box[] getReferencesToSpace() throws Exception {
        if (this.fWgs84Box == null) {
            getReferences();
        }

        if (log.isDebugEnabled()) {
            StringBuilder output = new StringBuilder(
                    "     space (-> \"x1\", \"y1\", \"x2\", \"y2\", \"areaid\"(=Gemeindekennziffer) index fields, coordinates will be padded): ");
            for (Wgs84Box bbox : this.fWgs84Box) {
                output.append("name:'").append(bbox.getTopicName()).append("'");
                output.append(" x1:").append(bbox.getX1());
                output.append(" y1:").append(bbox.getY1());
                output.append(" x2:").append(bbox.getX2());
                output.append(" y2:").append(bbox.getY2());
                output.append(" communityCode:").append(bbox.getGemeindekennziffer()).append(", ");
            }
            log.debug(output.toString());
        }

        return this.fWgs84Box.toArray(new Wgs84Box[this.fWgs84Box.size()]);
    }

    /**
     * The method searches all topics of type <b>location</b> and provides the baseNames. The topics
     * came from the document that was analyzed by a previous
     * {@linkplain #getBuzzwords(String, int, boolean)} call.
     *
     * @return A set of locations. It is empty, if no location topics are available.
     * @throws Exception If the references cannot be computed.
     */
    public Set<String> getLocations() throws Exception {
        if (this.fLocationNames == null) {
            getReferences();
        }

        // we use set to remove duplicates (insertion order is kept)
        Set<String> ret = new LinkedHashSet<String>(fLocationNames);

        if (log.isDebugEnabled()) {
            log.debug(joinForLog("     locations (-> \"location\" index field): ", ret));
        }

        return ret;
    }

    /**
     * Create URL from url String.
     *
     * @param urlStr The textual URL.
     * @return The parsed URL, never null.
     * @throws MalformedURLException If <code>urlStr</code> cannot be parsed; the problem is
     *             logged before the exception is passed on.
     */
    private URL createURL(String urlStr) throws MalformedURLException {
        try {
            return new URL(urlStr);
        } catch (MalformedURLException ex) {
            log.warn("Error building URL " + urlStr, ex);
            throw ex;
        }
    }

    /** True if the passed string is neither null nor empty after trimming. */
    private static boolean hasText(String value) {
        return value != null && !value.trim().isEmpty();
    }

    /** Converts via the decimal text of the float, so exactly the same digits are kept. */
    private static double toDouble(float value) {
        return Double.parseDouble(Float.toString(value));
    }

    /** Builds a debug line: the prefix followed by every value, each one followed by ", ". */
    private static String joinForLog(String prefix, Iterable<String> values) {
        StringBuilder output = new StringBuilder(prefix);
        for (String value : values) {
            output.append(value).append(", ");
        }
        return output.toString();
    }
}