// Java tutorial
/* Copyright 2013 Fabian Steeg, hbz. Licensed under the Eclipse Public License 1.0 */ package models; import static org.elasticsearch.index.query.QueryBuilders.boolQuery; import static org.elasticsearch.index.query.QueryBuilders.matchQuery; import java.util.ArrayList; import java.util.Arrays; import java.util.Calendar; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.stream.Collectors; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; import org.elasticsearch.action.search.SearchRequestBuilder; import org.elasticsearch.action.search.SearchResponse; import org.elasticsearch.action.search.SearchType; import org.elasticsearch.client.Client; import org.elasticsearch.client.transport.TransportClient; import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.common.transport.InetSocketTransportAddress; import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.index.query.BoolQueryBuilder; import org.elasticsearch.index.query.FilterBuilders; import org.elasticsearch.index.query.MatchQueryBuilder; import org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.search.SearchHit; import org.elasticsearch.search.SearchHits; import org.elasticsearch.search.sort.SortBuilders; import org.elasticsearch.search.sort.SortOrder; import com.google.common.base.Joiner; import com.google.common.base.Predicate; import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterables; import controllers.Serialization; import models.queries.AbstractIndexQuery; import models.queries.LobidItems; import models.queries.LobidResources; import play.Logger; import play.mvc.Http.Request; import play.mvc.Results.Chunks; import play.mvc.Results.StringChunks; /** * Search documents in an ElasticSearch index. 
* * @author Fabian Steeg (fsteeg) * @author Pascal Christoph (dr0i) */ public class Search { /** The ElasticSearch server to use. */ public static final InetSocketTransportAddress ES_SERVER = new InetSocketTransportAddress( Index.CONFIG.getString("application.es.server"), Index.CONFIG.getInt("application.es.port")); /** The ElasticSearch cluster to use. */ public static final String ES_CLUSTER_NAME = Index.CONFIG.getString("application.es.cluster"); private static Client productionClient = new TransportClient( ImmutableSettings.settingsBuilder().put("cluster.name", ES_CLUSTER_NAME).build()) .addTransportAddress(ES_SERVER); /** The ElasticSearch client to use. */ public static Client client = productionClient; /* TODO find a better way to inject the client for testing */ /** Required: */ private final Index index; private final Map<Parameter, String> parameters; /** Optional: */ private String field = ""; private String owner = ""; private String set = ""; private int size = 50; private int from = 0; private String type = ""; private String sort = ""; private String scroll = ""; /** * As there is a memory issue within Promise we musn't allow data > RAM. As * experience shows, this is around 4 M docs (dependig on the serialisation). */ public static final int MAX_SCROLL_HITS = 3000000; private List<Document> documents = null; private Long hitCount = null; private static boolean doingScrollScanNow = false; private Chunks.Out<String> messageOut; /** * @param parameters The search parameters (see {@link Index#queries()} ) * @param index The index to search (see {@link Index}) */ public Search(final Map<Parameter, String> parameters, final Index index) { if (parameters == null) { throw new IllegalArgumentException("Can't work with null parameters"); } this.index = index; this.parameters = parameters; } /** * @param newClient The new elasticsearch client to use. */ public static void clientSet(Client newClient) { client = newClient; } /** Reset the elasticsearch client. 
*/ public static void clientReset() { client = productionClient; } /** * Execute the search and return its results. * * @return The documents matching this search */ public List<Document> documents() { if (documents == null) initResults(); return documents; } /** * @return The total number of hits for this search. */ public long totalHits() { if (hitCount == null) initResults(); return hitCount; } private void initResults() { Pair<List<Document>, Long> result = doSearch(); this.documents = result.getLeft(); this.hitCount = result.getRight(); } private Pair<List<Document>, Long> doSearch() { validateSearchParameters(); final QueryBuilder queryBuilder = createQuery(); Logger.trace("Using query: " + queryBuilder); final SearchResponse response = search(queryBuilder, Boolean.getBoolean(parameters.get(Parameter.SCROLL)) ? SearchType.SCAN : SearchType.DFS_QUERY_THEN_FETCH); Logger.trace("Got response: " + response); final SearchHits hits = response.getHits(); final List<Document> docs = asDocuments(hits, fields(parameters)); final Pair<List<Document>, Long> result = new ImmutablePair<>(docs, hits.getTotalHits()); Logger.debug( String.format("Got %s hits overall, created %s matching docs", hits.getTotalHits(), docs.size())); return result; } private List<String> fields(Map<Parameter, String> queries) { return queries.keySet().stream().flatMap(p -> index.queries().get(p).fields().stream()) .collect(Collectors.toList()); } /** * Optional: specify a field to pick from the full result * * @param resultField The field to return as the result * @return this search object (for chaining) */ public Search field(final String resultField) { this.field = resultField; return this; } /** * Optional: specify a resource owner * * @param resourceOwner An ID for the owner of requested resources * @return this search object (for chaining) */ public Search owner(final String resourceOwner) { this.owner = resourceOwner; return this; } /** * Optional: specify a resource set * * @param 
resourceSet An ID for the set the requested resources should be in * @return this search object (for chaining) */ public Search set(final String resourceSet) { this.set = resourceSet; return this; } /** * Optional: specify the page size * * @param pageFrom The start index of the result set * @param pageSize The size of the result set * @return this search object (for chaining) */ public Search page(final int pageFrom, final int pageSize) { this.from = pageFrom; this.size = pageSize; return this; } /** * Optional: specify a type * * @param resourceType The type of the requested resources * @return this search object (for chaining) */ public Search type(final String resourceType) { this.type = resourceType; return this; } /** * Optional: specify a sort order * * @param sortOrder The sort order: newest, oldest * @return this search object (for chaining) */ public Search sort(final String sortOrder) { this.sort = sortOrder; return this; } /** * Optional: specify doing a scroll scan query * * @param scrollValue The value of the scroll parameter * @return this search object (for chaining) */ public Search scroll(final String scrollValue) { this.scroll = scrollValue; return this; } /** * @return number of hits of the query */ public long getTotalHits() { return (startInitialResponse().getHits().getTotalHits()); } /** * @param request The clients request * @param serialization The wanted serialization of the returned data. 
* @return the chunks of the elasticsearch scroll scan query */ public Chunks<String> executeScrollScan(final Request request, final Serialization serialization) { validateSearchParameters(); return new StringChunks() { @Override public void onReady(Chunks.Out<String> out) { setMessageOut(out); ExecutorService executorService = Executors.newSingleThreadExecutor(); executorService.execute(new Runnable() { @Override public void run() { doingScrollScanNow = true; bulk(request, serialization); } }); executorService.shutdown(); } }; } /** * * @return if a scroll scan is done right now */ public boolean doingScrollScanNow() { return doingScrollScanNow; } private void bulk(final Request request, final Serialization serialization) { boolean JSON_LD = serialization.equals(Serialization.JSON_LD); boolean RDF_XML = serialization.equals(Serialization.RDF_XML); try { long lastTime = Calendar.getInstance().getTimeInMillis(); SearchResponse searchResponse = startInitialResponse(); final SearchHits hits = getTotalHits(searchResponse); if (JSON_LD) getMessageOut().write("["); if (RDF_XML) getMessageOut().write("<root>"); long cnt = 0; long to = hits.getTotalHits(); String str; while ((str = getHitsAsString(request, searchResponse, JSON_LD, cnt, to, serialization)) != null) { if (JSON_LD) getMessageOut().write(str.substring(1, str.length() - 1)); else getMessageOut().write(str); cnt = cnt + searchResponse.getHits().getHits().length; Logger.info( "Doc " + cnt + " ,sec:" + ((Calendar.getInstance().getTimeInMillis() - lastTime) / 1000)); searchResponse = client.prepareSearchScroll(searchResponse.getScrollId()).setScroll("1m").execute() .actionGet(); // Getting is much faster than serving over http => memory fills up. // So try to break down dynamically (lesser mem, more pausing). // Eventually cancel if pauses took too long (10 secs). 
long freeMem = Runtime.getRuntime().freeMemory(); // slow down if only 600 MB left if (freeMem < 600 * 1024 * 1024) { long sleep = (long) Math.pow((1D / (freeMem / 1024D / 1024D / 1024D / 7D)), 2); Logger.info( "Free memory low: " + freeMem / 1024 / 1024 + " MB, sleeping for " + sleep + " ms."); if (sleep > 20000) { Logger.warn("nMemory too low. Canceling request!"); getMessageOut().write( "\nMemory too low. Canceling your request! Please contact 'semweb at hbz-nrw.de' or try again (probably some days) later."); break; } Thread.sleep(sleep); } } } catch (InterruptedException e) { e.printStackTrace(); } finally { if (JSON_LD) getMessageOut().write("\n]"); if (RDF_XML) getMessageOut().write("</root>"); getMessageOut().close(); doingScrollScanNow = false; Logger.info("Finished scroll scan dump"); } } private SearchHits getTotalHits(final SearchResponse response) { final SearchHits hits = response.getHits(); final List<Document> docs = asDocuments(hits, fields(parameters)); Logger.info( String.format("Got %s hits overall, created %s matching docs", hits.getTotalHits(), docs.size())); return hits; } private SearchResponse startInitialResponse() { QueryBuilder queryBuilder = createQuery(); Logger.trace("Using scroll query: " + queryBuilder); SearchResponse response = search(queryBuilder, SearchType.SCAN); Logger.trace("Got scroll response: " + response); response = client.prepareSearchScroll(response.getScrollId()).setScroll(TimeValue.timeValueMinutes(1)) .execute().actionGet(); return response; } private String getHitsAsString(final Request request, final SearchResponse response, final boolean JSON_LD, long cnt, long to, Serialization serialization) { if (response.getHits().getHits().length > 0 && cnt <= to) { if (JSON_LD && cnt > 0) getMessageOut().write(",\n"); return controllers.Application.getSerializedResult(asDocuments(response.getHits(), fields(parameters)), index, field, to, false, request, serialization); } return null; } /** * @return The query object for this 
search */ public QueryBuilder createQuery() { QueryBuilder queryBuilder = boolQueryFromParams(); if (!owner.isEmpty() && !owner.equals("*")) { final QueryBuilder itemQuery = new LobidItems.OwnerQuery().build(owner); queryBuilder = boolQuery().must(queryBuilder).must(itemQuery); } if (!set.isEmpty()) { final QueryBuilder setQuery = new LobidResources.SetQuery().build(set); queryBuilder = boolQuery().must(queryBuilder).must(setQuery); } String typeValues = AbstractIndexQuery.withoutBooleanOperator(type); boolean isAndQuery = AbstractIndexQuery.isAndQuery(type); if (!typeValues.isEmpty()) { BoolQueryBuilder typeQuery = boolQuery(); for (String t : typeValues.split(",")) { MatchQueryBuilder query = matchQuery("@graph.@type", t); typeQuery = isAndQuery ? typeQuery.must(query) : typeQuery.should(query); } queryBuilder = boolQuery().must(queryBuilder).must(typeQuery); } if (!scroll.isEmpty()) { if (this.scroll.matches("\\d{8}")) { final QueryBuilder changedSinceQuery = new LobidResources.ChangedSinceQuery().build(this.scroll); queryBuilder = boolQuery().must(queryBuilder).must(changedSinceQuery); } } if (queryBuilder == null) throw new IllegalStateException( String.format("Could not construct query for queries '%s', owner '%s'", queryBuilder, owner)); return queryBuilder; } private QueryBuilder boolQueryFromParams() { BoolQueryBuilder builder = boolQuery(); for (QueryBuilder q : getQueries()) { builder = builder.must(q); } return builder; } private List<QueryBuilder> getQueries() { List<QueryBuilder> res = new ArrayList<>(); for (Entry<Parameter, String> entry : parameters.entrySet()) { res.add(index.queries().get(entry.getKey()).build(entry.getValue())); } return res; } private void validateSearchParameters() { if (index == null) { throw new IllegalArgumentException( String.format("Invalid index ('%s') - valid indexes: %s", index, Index.values())); } for (Entry<Parameter, String> entry : parameters.entrySet()) { Parameter parameter = entry.getKey(); if 
(!index.queries().containsKey(parameter)) { throw new IllegalArgumentException( String.format("Invalid parameter ('%s') for specified index ('%s') - valid: %s", parameter, index, index.queries().keySet())); } } if (from < 0) { throw new IllegalArgumentException("Parameter 'from' must be positive"); } if (size > 100) { throw new IllegalArgumentException("Parameter 'size' must be <= 100"); } final List<String> sortSupported = Arrays.asList("newest", "oldest"); if (!sort.isEmpty() && !sortSupported.contains(sort)) { throw new IllegalArgumentException( "Parameter 'sort' must be one of: " + Joiner.on(", ").join(sortSupported)); } } private SearchResponse search(final QueryBuilder queryBuilder, SearchType searchType) { SearchRequestBuilder requestBuilder = client.prepareSearch(index.id()).setSearchType(searchType) .setQuery(queryBuilder).setPostFilter(FilterBuilders.typeFilter(index.type())); if (searchType.equals(SearchType.SCAN)) requestBuilder.setScroll(TimeValue.timeValueMinutes(1)); if (owner.equals("*")) requestBuilder = requestBuilder.setPostFilter(FilterBuilders.existsFilter(// "@graph.http://purl.org/vocab/frbr/core#exemplar.@id")); if (!sort.isEmpty()) { requestBuilder.addSort(SortBuilders.fieldSort("@graph.http://purl.org/dc/terms/issued.@value") .order(sort.equals("newest") ? 
SortOrder.DESC : SortOrder.ASC).ignoreUnmapped(true)); } final SearchResponse response = requestBuilder.setFrom(from).setSize(size).setExplain(false).execute() .actionGet(); return response; } private List<Document> asDocuments(final SearchHits hits, final List<String> searchFields) { final List<Document> res = new ArrayList<>(); for (SearchHit hit : hits) { try { Hit hitEnum = Hit.of(hit, searchFields); final Document document = new Document(hit.getId(), new String(hit.source()), index, field); res.add(hitEnum.process(parameters.values().iterator().next(), document)); } catch (IllegalArgumentException e) { Logger.error(e.getMessage(), e); } } final Predicate<Document> predicate = doc -> { return doc.matchedField != null; }; return ImmutableList.copyOf(Iterables.filter(res, predicate)); } private Chunks.Out<String> getMessageOut() { return messageOut; } private void setMessageOut(final Chunks.Out<String> messageOut) { this.messageOut = messageOut; } }