Source code for org.sleuthkit.autopsy.keywordsearch.AccountsText.java, from the Autopsy Forensic Browser (Sleuth Kit) project.

/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2016 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.lang.StringUtils;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrRequest.METHOD;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.Version;

/**
 * Highlights account (credit card number) hits for a given Solr document.
 * Knows about pages and such for the content viewer.
 *
 * A "page" here corresponds to a Solr chunk of the indexed content; only
 * chunks that actually contain hits are tracked (see loadPageInfo()).
 *
 * Note: This class started as a copy-and-paste of HighlightedText, but it
 * proved too messy to modify HighlightedText to work for accounts also. This
 * and HighlightedText are very similar and could probably use some refactoring
 * to reduce code duplication.
 *
 * NOTE(review): loadPageInfo() is synchronized, but the paging/hit state it
 * initializes is also read and written without synchronization elsewhere
 * (e.g. getText() updates numberOfHitsPerPage). This presumably assumes
 * single-threaded use from the content viewer — confirm.
 */
class AccountsText implements IndexedText {

    private static final Logger LOGGER = Logger.getLogger(AccountsText.class.getName());
    // Extra Solr debug info is only requested in development builds.
    private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);

    // Markup wrapped around each hit by the Solr highlighter (hl.tag.pre/post).
    private static final String HIGHLIGHT_PRE = "<span style='background:yellow'>"; //NON-NLS
    private static final String HIGHLIGHT_POST = "</span>"; //NON-NLS
    // Prefix for the HTML anchors inserted before each hit so the viewer can
    // scroll to hit N via anchor name ANCHOR_NAME_PREFIX + N.
    private static final String ANCHOR_NAME_PREFIX = AccountsText.class.getName() + "_";

    private static final String INSERT_PREFIX = "<a name='" + ANCHOR_NAME_PREFIX; //NON-NLS
    private static final String INSERT_POSTFIX = "'></a>$0"; //$0 will insert current regex match  //NON-NLS
    // Matches the highlighter's opening tag; used to find hits in the returned
    // markup and to count them.
    private static final Pattern ANCHOR_DETECTION_PATTERN = Pattern.compile(HIGHLIGHT_PRE);

    private static final String HIGHLIGHT_FIELD = LuceneQuery.HIGHLIGHT_FIELD_REGEX;

    private final Server solrServer;
    // Full Solr document id as passed in; may include a chunk id suffix.
    private final String solrDocumentId;
    // Object id portion of solrDocumentId (the part before the chunk separator).
    private final long solrObjectId;
    // Chunk id portion of solrDocumentId, or null if none was supplied.
    private final Integer chunkId;
    // The account numbers/keywords being highlighted.
    private final Set<String> keywords = new HashSet<>();
    private final String displayName;
    // Solr query that matches documents containing any of the keywords in the
    // highlight field; built once in the constructor.
    private final String queryString;

    private boolean isPageInfoLoaded = false;
    private int numberPagesForFile = 0;
    private int currentPage = 0;
    //list of pages, used for iterating back and forth.  Only stores pages with hits
    private final List<Integer> pages = new ArrayList<>();
    //map from page/chunk to number of hits. value is 0 if not yet known.
    private final LinkedHashMap<Integer, Integer> numberOfHitsPerPage = new LinkedHashMap<>();
    //map from page/chunk number to current hit on that page.
    private final HashMap<Integer, Integer> currentHitPerPage = new HashMap<>();

    /**
     * Construct a highlighter for account hits in the given Solr document.
     *
     * @param objectId Solr document id: either a bare object id, or
     *                 objectId + Server.CHUNK_ID_SEPARATOR + chunkId when a
     *                 specific chunk is targeted. Must parse as described or
     *                 a NumberFormatException propagates.
     * @param keywords the account numbers/keywords to highlight; also
     *                 determines the singular/plural display name.
     */
    @NbBundle.Messages({ "AccountsText.creditCardNumber=Credit Card Number",
            "AccountsText.creditCardNumbers=Credit Card Numbers" })
    AccountsText(String objectId, Set<String> keywords) {
        this.solrDocumentId = objectId;
        this.keywords.addAll(keywords);

        //build the query string
        this.queryString = HIGHLIGHT_FIELD + ":"
                + keywords.stream().map(keyword -> "/.*?" + KeywordSearchUtil.escapeLuceneQuery(keyword) + ".*?/")//surround each "keyword" with match anything regex.
                        .collect(Collectors.joining(" ")); //collect as space separated string

        this.solrServer = KeywordSearch.getServer();

        // Split the document id into object id and optional chunk id.
        final int separatorIndex = solrDocumentId.indexOf(Server.CHUNK_ID_SEPARATOR);
        if (-1 == separatorIndex) {
            //no chunk id in solrDocumentId
            this.solrObjectId = Long.parseLong(solrDocumentId);
            this.chunkId = null;
        } else {
            //solrDocumentId includes chunk id
            this.solrObjectId = Long.parseLong(solrDocumentId.substring(0, separatorIndex));
            this.chunkId = Integer.parseInt(solrDocumentId.substring(separatorIndex + 1));
        }

        displayName = keywords.size() == 1 ? Bundle.AccountsText_creditCardNumber()
                : Bundle.AccountsText_creditCardNumbers();
    }

    /**
     * @return the Solr object id (without any chunk id) for the content this
     *         highlighter covers.
     */
    long getObjectId() {
        return this.solrObjectId;
    }

    /**
     * @return total number of pages/chunks for the file (not just pages with
     *         hits); 1 when a single chunk was requested, 0 until
     *         loadPageInfo() has run.
     */
    @Override
    public int getNumberPages() {
        return this.numberPagesForFile;
    }

    /**
     * @return the chunk id of the page currently being displayed.
     */
    @Override
    public int getCurrentPage() {
        return this.currentPage;
    }

    /**
     * @return true if there is a page with hits after the current one in the
     *         pages list.
     */
    @Override
    public boolean hasNextPage() {
        return pages.indexOf(this.currentPage) < pages.size() - 1;

    }

    /**
     * @return true if there is a page with hits before the current one in the
     *         pages list.
     */
    @Override
    public boolean hasPreviousPage() {
        return pages.indexOf(this.currentPage) > 0;

    }

    /**
     * Advance to the next page with hits.
     *
     * @return the new current page (chunk id)
     *
     * @throws IllegalStateException if there is no next page
     */
    @Override
    @NbBundle.Messages("AccountsText.nextPage.exception.msg=No next page.")
    public int nextPage() {
        if (hasNextPage()) {
            currentPage = pages.get(pages.indexOf(this.currentPage) + 1);
            return currentPage;
        } else {
            throw new IllegalStateException(Bundle.AccountsText_nextPage_exception_msg());
        }
    }

    /**
     * Move back to the previous page with hits.
     *
     * @return the new current page (chunk id)
     *
     * @throws IllegalStateException if there is no previous page
     */
    @Override
    @NbBundle.Messages("AccountsText.previousPage.exception.msg=No previous page.")
    public int previousPage() {
        if (hasPreviousPage()) {
            currentPage = pages.get(pages.indexOf(this.currentPage) - 1);
            return currentPage;
        } else {
            throw new IllegalStateException(Bundle.AccountsText_previousPage_exception_msg());
        }
    }

    /**
     * @return true if the current hit index on this page is below the page's
     *         total hit count. Note: until getText() has run for this page the
     *         total is 0, so this returns false.
     */
    @Override
    public boolean hasNextItem() {
        if (this.currentHitPerPage.containsKey(currentPage)) {
            return this.currentHitPerPage.get(currentPage) < this.numberOfHitsPerPage.get(currentPage);
        } else {
            return false;
        }
    }

    /**
     * @return true if the current hit index on this page is past the first
     *         hit (hits are 1-based; 0 means "no hit selected yet").
     */
    @Override
    public boolean hasPreviousItem() {
        if (this.currentHitPerPage.containsKey(currentPage)) {
            return this.currentHitPerPage.get(currentPage) > 1;
        } else {
            return false;
        }
    }

    /**
     * Advance to the next hit on the current page.
     *
     * @return the new (1-based) hit index on this page
     *
     * @throws IllegalStateException if there is no next hit
     */
    @Override
    @NbBundle.Messages("AccountsText.nextItem.exception.msg=No next item.")
    public int nextItem() {
        if (hasNextItem()) {
            // merge(..., 1, Integer::sum) increments the counter and returns
            // the new value in one step.
            return currentHitPerPage.merge(currentPage, 1, Integer::sum);
        } else {
            throw new IllegalStateException(Bundle.AccountsText_nextItem_exception_msg());
        }
    }

    /**
     * Move back to the previous hit on the current page.
     *
     * @return the new (1-based) hit index on this page
     *
     * @throws IllegalStateException if there is no previous hit
     */
    @Override
    @NbBundle.Messages("AccountsText.previousItem.exception.msg=No previous item.")
    public int previousItem() {
        if (hasPreviousItem()) {
            // merge(..., -1, Integer::sum) decrements the counter and returns
            // the new value.
            return currentHitPerPage.merge(currentPage, -1, Integer::sum);
        } else {
            throw new IllegalStateException(Bundle.AccountsText_previousItem_exception_msg());
        }
    }

    /**
     * @return the (1-based) index of the current hit on the current page, or 0
     *         if no hit is selected / the page is unknown.
     */
    @Override
    public int currentItem() {
        if (this.currentHitPerPage.containsKey(currentPage)) {
            return currentHitPerPage.get(currentPage);
        } else {
            return 0;
        }
    }

    /**
     * @return live map from page/chunk number to hit count on that page
     *         (0 until getText() has rendered the page). Exposed unmodified;
     *         callers are trusted not to mutate it.
     */
    @Override
    public LinkedHashMap<Integer, Integer> getHitsPages() {
        return this.numberOfHitsPerPage;
    }

    /**
     * Initialize this object with information about which pages/chunks have
     * hits. Multiple calls will not change the initial results.
     *
     * If a specific chunk id was supplied, only that one page is tracked.
     * Otherwise Solr is queried for all chunks of this object that match the
     * keyword query, and the current page is set to the first page with hits.
     * Note: if the chunk-count query fails, this returns early WITHOUT setting
     * isPageInfoLoaded, so the next call retries.
     */
    synchronized private void loadPageInfo() {
        if (isPageInfoLoaded) {
            return;
        }
        if (chunkId != null) {//if a chunk is specified, only show that chunk/page
            this.numberPagesForFile = 1;
            this.currentPage = chunkId;
            this.numberOfHitsPerPage.put(chunkId, 0);
            this.pages.add(chunkId);
            this.currentHitPerPage.put(chunkId, 0);
        } else {
            try {
                this.numberPagesForFile = solrServer.queryNumFileChunks(this.solrObjectId);
            } catch (KeywordSearchModuleException | NoOpenCoreException ex) {
                LOGGER.log(Level.WARNING, "Could not get number pages for content " + this.solrDocumentId, ex); //NON-NLS
                return;
            }

            //if has chunks, get pages with hits
            TreeSet<Integer> sortedPagesWithHits = new TreeSet<>();
            SolrQuery q = new SolrQuery();
            q.setShowDebugInfo(DEBUG); //debug
            q.setQuery(queryString);
            q.setFields(Server.Schema.ID.toString()); //for this case we only need the document ids
            // Restrict to chunks of this object: id matches "<objectId><sep>*".
            q.addFilterQuery(
                    Server.Schema.ID.toString() + ":" + this.solrObjectId + Server.CHUNK_ID_SEPARATOR + "*");

            try {
                QueryResponse response = solrServer.query(q, METHOD.POST);
                for (SolrDocument resultDoc : response.getResults()) {
                    final String resultDocumentId = resultDoc.getFieldValue(Server.Schema.ID.toString()).toString();
                    // Put the solr chunk id in the map
                    String resultChunkID = StringUtils.substringAfter(resultDocumentId, Server.CHUNK_ID_SEPARATOR);
                    if (StringUtils.isNotBlank(resultChunkID)) {
                        sortedPagesWithHits.add(Integer.parseInt(resultChunkID));
                    } else {
                        // Document id had no chunk suffix; treat it as page 0.
                        sortedPagesWithHits.add(0);
                    }
                }

            } catch (KeywordSearchModuleException | NoOpenCoreException | NumberFormatException ex) {
                // Best-effort: log and continue with whatever pages were found.
                LOGGER.log(Level.WARNING, "Error executing Solr highlighting query: " + keywords, ex); //NON-NLS
            }

            //set page to first page having highlights
            if (sortedPagesWithHits.isEmpty()) {
                this.currentPage = 0;
            } else {
                this.currentPage = sortedPagesWithHits.first();
            }

            for (Integer page : sortedPagesWithHits) {
                numberOfHitsPerPage.put(page, 0); //unknown number of matches in the page
                pages.add(page);
                currentHitPerPage.put(page, 0); //set current hit to 0th
            }
        }

        isPageInfoLoaded = true;
    }

    /**
     * Get the current page's text with account hits highlighted and anchored.
     *
     * Runs a Solr highlighting query restricted to the current page's chunk,
     * prepends a named anchor before each highlighted hit (so the viewer can
     * scroll to hit N), records the page's hit count, and selects the first
     * hit if none is selected. On failure or no hits, returns a localized
     * message instead of content.
     *
     * @return HTML markup for the current page, or a no-match / query-failed
     *         message
     */
    @Override
    @NbBundle.Messages({ "AccountsText.getMarkup.noMatchMsg="
            + "<html><pre><span style\\\\='background\\\\:yellow'>There were no keyword hits on this page. <br />"
            + "The keyword could have been in the file name."
            + " <br />Advance to another page if present, or to view the original text, choose File Text"
            + " <br />in the drop down menu to the right...</span></pre></html>",
            "AccountsText.getMarkup.queryFailedMsg="
                    + "<html><pre><span style\\\\='background\\\\:yellow'>Failed to retrieve keyword hit results."
                    + " <br />Confirm that Autopsy can connect to the Solr server. "
                    + "<br /></span></pre></html>" })
    public String getText() {
        loadPageInfo(); //inits once

        SolrQuery q = new SolrQuery();
        q.setShowDebugInfo(DEBUG); //debug
        q.addHighlightField(HIGHLIGHT_FIELD);
        q.setQuery(queryString);

        //set the documentID filter
        String queryDocumentID = this.solrObjectId + Server.CHUNK_ID_SEPARATOR + this.currentPage;
        q.addFilterQuery(Server.Schema.ID.toString() + ":" + queryDocumentID);

        //configure the highlighter
        q.setParam("hl.useFastVectorHighlighter", "true"); //fast highlighter scales better than standard one NON-NLS
        q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS
        q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS
        q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS
        q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //docs says makes sense for the original Highlighter only, but not really //NON-NLS

        try {
            //extract highlighting and bail early on null responses
            Map<String, Map<String, List<String>>> highlightingPerDocument = solrServer.query(q, METHOD.POST)
                    .getHighlighting();
            Map<String, List<String>> highlightingPerField = highlightingPerDocument.get(queryDocumentID);
            if (highlightingPerField == null) {
                return Bundle.AccountsText_getMarkup_noMatchMsg();
            }
            List<String> highlights = highlightingPerField.get(HIGHLIGHT_FIELD);
            if (highlights == null) {
                return Bundle.AccountsText_getMarkup_noMatchMsg();
            }

            //There should only be one item
            String highlighting = highlights.get(0).trim();

            /*
             * use regex matcher to iterate over occurences of HIGHLIGHT_PRE,
             * and prepend them with an anchor tag.
             */
            Matcher m = ANCHOR_DETECTION_PATTERN.matcher(highlighting);
            StringBuffer sb = new StringBuffer(highlighting.length());
            int count = 0;
            while (m.find()) {
                count++;
                // $0 in INSERT_POSTFIX re-inserts the matched HIGHLIGHT_PRE
                // after the numbered anchor.
                m.appendReplacement(sb, INSERT_PREFIX + count + INSERT_POSTFIX);
            }
            m.appendTail(sb);

            //store total hits for this page, now that we know it
            this.numberOfHitsPerPage.put(this.currentPage, count);
            // Auto-select the first hit on the page if none is selected yet.
            if (this.currentItem() == 0 && this.hasNextItem()) {
                this.nextItem();
            }

            // extracted content (minus highlight tags) is HTML-escaped
            return "<html><pre>" + sb.toString() + "</pre></html>"; //NON-NLS
        } catch (Exception ex) {
            // Broad catch: any failure (Solr, parsing, NPE) degrades to the
            // query-failed message rather than breaking the viewer.
            LOGGER.log(Level.WARNING, "Error executing Solr highlighting query: " + keywords, ex); //NON-NLS
            return Bundle.AccountsText_getMarkup_queryFailedMsg();
        }
    }

    /**
     * @return the localized display name (singular or plural "Credit Card
     *         Number(s)" depending on keyword count)
     */
    @Override
    public String toString() {
        return displayName;
    }

    @Override
    public boolean isSearchable() {
        return true;
    }

    /**
     * @return the prefix of the anchor names inserted before each hit; the
     *         viewer appends the hit number to navigate.
     */
    @Override
    public String getAnchorPrefix() {
        return ANCHOR_NAME_PREFIX;
    }

    /**
     * @return number of hits on the current page, or 0 if unknown (i.e.
     *         getText() has not rendered this page yet).
     */
    @Override
    public int getNumberHits() {
        if (!this.numberOfHitsPerPage.containsKey(this.currentPage)) {
            return 0;
        }
        return this.numberOfHitsPerPage.get(this.currentPage);
    }
}