Java tutorial — Talend Data Prep TransformationService
// ============================================================================
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// https://github.com/Talend/data-prep/blob/master/LICENSE
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================

package org.talend.dataprep.transformation.service;

import static java.util.Collections.singletonList;
import static java.util.stream.Collectors.toList;
import static org.springframework.http.MediaType.APPLICATION_JSON_VALUE;
import static org.springframework.http.MediaType.APPLICATION_OCTET_STREAM_VALUE;
import static org.springframework.web.bind.annotation.RequestMethod.DELETE;
import static org.springframework.web.bind.annotation.RequestMethod.GET;
import static org.springframework.web.bind.annotation.RequestMethod.POST;
import static org.talend.daikon.exception.ExceptionContext.build;
import static org.talend.dataprep.api.dataset.ColumnMetadata.Builder.column;
import static org.talend.dataprep.api.export.ExportParameters.SourceType.HEAD;
import static org.talend.dataprep.exception.error.PreparationErrorCodes.PREPARATION_DOES_NOT_EXIST;
import static org.talend.dataprep.exception.error.TransformationErrorCodes.UNEXPECTED_EXCEPTION;
import static org.talend.dataprep.quality.AnalyzerService.Analysis.SEMANTIC;
import static org.talend.dataprep.transformation.actions.category.ScopeCategory.COLUMN;
import static org.talend.dataprep.transformation.actions.category.ScopeCategory.LINE;
import static org.talend.dataprep.transformation.format.JsonFormat.JSON;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.io.PipedInputStream;
import java.io.PipedOutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.zip.GZIPOutputStream;

import javax.annotation.Resource;
import javax.validation.Valid;

import org.apache.commons.io.output.NullOutputStream;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.task.TaskExecutor;
import org.springframework.http.MediaType;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.ResponseBody;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.context.WebApplicationContext;
import org.springframework.web.servlet.mvc.method.annotation.StreamingResponseBody;
import org.talend.daikon.exception.ExceptionContext;
import org.talend.dataprep.api.action.ActionDefinition;
import org.talend.dataprep.api.dataset.ColumnMetadata;
import org.talend.dataprep.api.dataset.DataSet;
import org.talend.dataprep.api.dataset.DataSetMetadata;
import org.talend.dataprep.api.dataset.RowMetadata;
import org.talend.dataprep.api.dataset.row.Flag;
import org.talend.dataprep.api.dataset.statistics.SemanticDomain;
import org.talend.dataprep.api.export.ExportParameters;
import org.talend.dataprep.api.preparation.Preparation;
import org.talend.dataprep.api.preparation.Step;
import org.talend.dataprep.api.preparation.StepDiff;
import org.talend.dataprep.cache.ContentCache;
import org.talend.dataprep.cache.ContentCacheKey;
import org.talend.dataprep.command.dataset.DataSetGet;
import org.talend.dataprep.command.dataset.DataSetGetMetadata;
import org.talend.dataprep.command.preparation.PreparationDetailsGet;
import org.talend.dataprep.conversions.BeanConversionService;
import org.talend.dataprep.dataset.StatisticsAdapter;
import org.talend.dataprep.exception.TDPException;
import org.talend.dataprep.exception.error.CommonErrorCodes;
import org.talend.dataprep.exception.error.TransformationErrorCodes;
import org.talend.dataprep.exception.json.JsonErrorCodeDescription;
import org.talend.dataprep.format.export.ExportFormat;
import org.talend.dataprep.format.export.ExportFormatMessage;
import org.talend.dataprep.metrics.Timed;
import org.talend.dataprep.metrics.VolumeMetered;
import org.talend.dataprep.quality.AnalyzerService;
import org.talend.dataprep.security.PublicAPI;
import org.talend.dataprep.security.SecurityProxy;
import org.talend.dataprep.transformation.actions.common.RunnableAction;
import org.talend.dataprep.transformation.aggregation.AggregationService;
import org.talend.dataprep.transformation.aggregation.api.AggregationParameters;
import org.talend.dataprep.transformation.aggregation.api.AggregationResult;
import org.talend.dataprep.transformation.api.action.ActionParser;
import org.talend.dataprep.transformation.api.action.context.ActionContext;
import org.talend.dataprep.transformation.api.action.context.TransformationContext;
import org.talend.dataprep.transformation.api.action.dynamic.DynamicType;
import org.talend.dataprep.transformation.api.action.dynamic.GenericParameter;
import org.talend.dataprep.transformation.api.transformer.TransformerFactory;
import org.talend.dataprep.transformation.api.transformer.configuration.Configuration;
import org.talend.dataprep.transformation.api.transformer.configuration.PreviewConfiguration;
import org.talend.dataprep.transformation.api.transformer.suggestion.Suggestion;
import org.talend.dataprep.transformation.api.transformer.suggestion.SuggestionEngine;
import org.talend.dataprep.transformation.cache.CacheKeyGenerator;
import org.talend.dataprep.transformation.cache.TransformationMetadataCacheKey;
import org.talend.dataprep.transformation.pipeline.ActionRegistry;
import org.talend.dataprep.transformation.preview.api.PreviewParameters;
import org.talend.dataquality.common.inference.Analyzer;
import org.talend.dataquality.common.inference.Analyzers;
import org.talend.dataquality.semantic.broadcast.BroadcastIndexObject;

import com.fasterxml.jackson.core.JsonParser;

import io.swagger.annotations.Api;
import io.swagger.annotations.ApiOperation;
import io.swagger.annotations.ApiParam;

@RestController
@Api(value = "transformations", basePath = "/transform", description = "Transformations on data")
public class TransformationService extends BaseTransformationService {

    /**
     * This class' logger.
     */
    private static final Logger LOG = LoggerFactory.getLogger(TransformationService.class);

    @Autowired
    private AnalyzerService analyzerService;
    /**
     * The Spring application context.
     */
    @Autowired
    private WebApplicationContext context;

    /**
     * All available transformation actions.
     */
    @Autowired
    private ActionRegistry actionRegistry;

    /**
     * The aggregation service.
     */
    @Autowired
    private AggregationService aggregationService;

    /**
     * The action suggestion engine.
     */
    @Autowired
    private SuggestionEngine suggestionEngine;

    /**
     * The transformer factory.
     */
    @Autowired
    private TransformerFactory factory;

    /**
     * Task executor for asynchronous processing.
     */
    @Resource(name = "serializer#json#executor")
    private TaskExecutor executor;

    /**
     * The security proxy enables a thread to borrow the identity of another user.
     */
    @Autowired
    private SecurityProxy securityProxy;

    @Autowired
    private ActionParser actionParser;

    @Autowired
    private CacheKeyGenerator cacheKeyGenerator;

    @Autowired
    private ContentCache contentCache;

    @Autowired
    private BeanConversionService beanConversionService;

    @Autowired
    private StatisticsAdapter statisticsAdapter;

    /**
     * The root step.
     */
    @Resource(name = "rootStep")
    private Step rootStep;

    @RequestMapping(value = "/apply", method = POST, consumes = APPLICATION_JSON_VALUE)
    @ApiOperation(value = "Run the transformation given the provided export parameters", notes = "This operation transforms the dataset or preparation using parameters in export parameters.")
    @VolumeMetered
    public StreamingResponseBody execute(
            @ApiParam(value = "Preparation id to apply.") @RequestBody @Valid final ExportParameters parameters) {
        return executeSampleExportStrategy(parameters);
    }

    /**
     * Apply the preparation to the dataset out of the given IDs.
     *
     * @param preparationId the preparation id to apply on the dataset.
     * @param datasetId the dataset id to transform.
     * @param formatName The output {@link ExportFormat format}. This format also sets the MIME response type.
     * @param stepId the preparation step id to use (default is 'head').
     * @param name the transformation name.
     * @param exportParams additional (optional) export parameters.
     */
    //@formatter:off
    @RequestMapping(value = "/apply/preparation/{preparationId}/dataset/{datasetId}/{format}", method = GET)
    @ApiOperation(value = "Transform the given preparation to the given format on the given dataset id", notes = "This operation transforms the dataset using preparation id in the provided format.")
    @VolumeMetered
    public StreamingResponseBody applyOnDataset(
            @ApiParam(value = "Preparation id to apply.") @PathVariable(value = "preparationId") final String preparationId,
            @ApiParam(value = "DataSet id to transform.") @PathVariable(value = "datasetId") final String datasetId,
            @ApiParam(value = "Output format") @PathVariable("format") final String formatName,
            @ApiParam(value = "Step id", defaultValue = "head") @RequestParam(value = "stepId", required = false, defaultValue = "head") final String stepId,
            @ApiParam(value = "Name of the transformation", defaultValue = "untitled") @RequestParam(value = "name", required = false, defaultValue = "untitled") final String name,
            @RequestParam final Map<String, String> exportParams) {
        //@formatter:on
        final ExportParameters exportParameters = new ExportParameters();
        exportParameters.setPreparationId(preparationId);
        exportParameters.setDatasetId(datasetId);
        exportParameters.setExportType(formatName);
        exportParameters.setStepId(stepId);
        exportParameters.setExportName(name);
        exportParameters.getArguments().putAll(exportParams);
        return executeSampleExportStrategy(exportParameters);
    }
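    // Example request (hypothetical IDs and format name, for illustration only):
    // apply preparation "prep-123" to dataset "ds-456" and stream the result back as CSV:
    //
    //   GET /apply/preparation/prep-123/dataset/ds-456/CSV?stepId=head&name=my-export
    //
    // Any extra query parameters are collected into 'exportParams' and forwarded as
    // export arguments.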
    /**
     * Export the dataset to the given format.
     *
     * @param datasetId the dataset id to transform.
     * @param formatName The output {@link ExportFormat format}. This format also sets the MIME response type.
     * @param name the transformation name.
     * @param exportParams additional (optional) export parameters.
     */
    //@formatter:off
    @RequestMapping(value = "/export/dataset/{datasetId}/{format}", method = GET)
    @ApiOperation(value = "Export the given dataset")
    @Timed
    public StreamingResponseBody exportDataset(
            @ApiParam(value = "DataSet id to transform.") @PathVariable(value = "datasetId") final String datasetId,
            @ApiParam(value = "Output format") @PathVariable("format") final String formatName,
            @ApiParam(value = "Name of the transformation", defaultValue = "untitled") @RequestParam(value = "name", required = false, defaultValue = "untitled") final String name,
            @RequestParam final Map<String, String> exportParams) {
        //@formatter:on
        return applyOnDataset(null, datasetId, formatName, null, name, exportParams);
    }

    /**
     * Compute the given aggregation.
     *
     * @param rawParams the aggregation parameters, sent as the request body.
     */
    // @formatter:off
    @RequestMapping(value = "/aggregate", method = POST, produces = APPLICATION_JSON_VALUE, consumes = APPLICATION_JSON_VALUE)
    @ApiOperation(value = "Compute the aggregation according to the request body rawParams", consumes = APPLICATION_JSON_VALUE)
    @VolumeMetered
    public AggregationResult aggregate(
            @ApiParam(value = "The aggregation rawParams in json") @RequestBody final String rawParams) {
        // @formatter:on

        // parse the aggregation parameters
        final AggregationParameters parameters;
        try {
            parameters = mapper.readerFor(AggregationParameters.class).readValue(rawParams);
            LOG.debug("Aggregation requested {}", parameters);
        } catch (IOException e) {
            throw new TDPException(CommonErrorCodes.BAD_AGGREGATION_PARAMETERS, e);
        }

        InputStream contentToAggregate;

        // get the content of the preparation (internal call with piped streams)
        if (StringUtils.isNotBlank(parameters.getPreparationId())) {
            try {
                PipedOutputStream temp = new PipedOutputStream();
                contentToAggregate = new PipedInputStream(temp);

                // because of piped streams, processing must be asynchronous
                Runnable r = () -> {
                    try {
                        final ExportParameters exportParameters = new ExportParameters();
                        exportParameters.setPreparationId(parameters.getPreparationId());
                        exportParameters.setDatasetId(parameters.getDatasetId());
                        if (parameters.getFilter() != null) {
                            exportParameters.setFilter(mapper.readTree(parameters.getFilter()));
                        }
                        exportParameters.setExportType(JSON);
                        exportParameters.setStepId(parameters.getStepId());
                        final StreamingResponseBody body = executeSampleExportStrategy(exportParameters);
                        body.writeTo(temp);
                    } catch (IOException e) {
                        throw new TDPException(CommonErrorCodes.UNABLE_TO_AGGREGATE, e);
                    }
                };
                executor.execute(r);
            } catch (IOException e) {
                throw new TDPException(CommonErrorCodes.UNABLE_TO_AGGREGATE, e);
            }
        } else {
            final DataSetGet dataSetGet = context.getBean(DataSetGet.class, parameters.getDatasetId(), false, true);
            contentToAggregate = dataSetGet.execute();
        }

        // apply the aggregation
        try (JsonParser parser = mapper.getFactory().createParser(contentToAggregate)) {
            final DataSet dataSet = mapper.readerFor(DataSet.class).readValue(parser);
            return aggregationService.aggregate(parameters, dataSet);
        } catch (IOException e) {
            throw new TDPException(CommonErrorCodes.UNABLE_TO_PARSE_JSON, e);
        } finally {
            // don't forget to release the connection
            if (contentToAggregate != null) {
                try {
                    contentToAggregate.close();
                } catch (IOException e) {
                    LOG.warn("Could not close dataset input stream while aggregating", e);
                }
            }
        }
    }
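    // A minimal sketch of the piped-stream pattern used in aggregate() above
    // (names are illustrative). The producer must run on a separate thread,
    // otherwise the pipe deadlocks once its internal buffer fills up:
    //
    //   PipedOutputStream out = new PipedOutputStream();
    //   PipedInputStream in = new PipedInputStream(out); // connected pair
    //   executor.execute(() -> writeExportTo(out));      // producer thread
    //   readAggregationFrom(in);                         // consumer (current thread)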
    /**
     * This operation allows clients to create a diff between two lists of actions starting from the same data. For
     * example, sending:
     * <ul>
     * <li>{a1, a2} as old actions</li>
     * <li>{a1, a2, a3} as new actions</li>
     * </ul>
     * ... will highlight changes done by a3.
     * <p>
     * To prevent the actions from exceeding the URL length limit, everything is shipped via the multipart request
     * body.
     *
     * @param previewParameters The preview parameters, encoded in json within the request body.
     * @param output Where to write the response.
     */
    //@formatter:off
    @RequestMapping(value = "/transform/preview", method = POST, produces = APPLICATION_JSON_VALUE)
    @ApiOperation(value = "Preview the transformation on input data", notes = "This operation returns the input data diff between the old and the new transformation actions", consumes = MediaType.MULTIPART_FORM_DATA_VALUE)
    @VolumeMetered
    public void transformPreview(
            @ApiParam(name = "body", value = "Preview parameters.") @RequestBody final PreviewParameters previewParameters,
            final OutputStream output) {
        //@formatter:on
        if (shouldApplyDiffToSampleSource(previewParameters)) {
            executeDiffOnSample(previewParameters, output);
        } else {
            executeDiffOnDataset(previewParameters, output);
        }
    }

    private void executeDiffOnSample(final PreviewParameters previewParameters, final OutputStream output) {
        final TransformationMetadataCacheKey metadataKey = cacheKeyGenerator.generateMetadataKey( //
                previewParameters.getPreparationId(), //
                rootStep.id(), //
                previewParameters.getSourceType() //
        );

        final ContentCacheKey contentKey = cacheKeyGenerator.generateContentKey( //
                previewParameters.getDataSetId(), //
                previewParameters.getPreparationId(), //
                rootStep.id(), //
                JSON, //
                previewParameters.getSourceType() //
        );

        try (final InputStream metadata = contentCache.get(metadataKey); //
                final InputStream content = contentCache.get(contentKey); //
                final JsonParser contentParser = mapper.getFactory().createParser(content)) {

            // build metadata
            final RowMetadata rowMetadata = mapper.readerFor(RowMetadata.class).readValue(metadata);
            final DataSetMetadata dataSetMetadata = new DataSetMetadata();
            dataSetMetadata.setRowMetadata(rowMetadata);

            // build dataset
            final DataSet dataSet = mapper.readerFor(DataSet.class).readValue(contentParser);
            dataSet.setMetadata(dataSetMetadata);

            // trigger diff
            executePreview( //
                    previewParameters.getNewActions(), //
                    previewParameters.getBaseActions(), //
                    previewParameters.getTdpIds(), //
                    dataSet, //
                    output //
            );
        } catch (final IOException e) {
            throw new TDPException(TransformationErrorCodes.UNABLE_TO_PERFORM_PREVIEW, e);
        }
    }
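    // Note: executeDiffOnSample (above) reads both the row metadata and the content
    // from the transformation cache, keyed by (preparation, root step, source type);
    // shouldApplyDiffToSampleSource (below) only routes a preview here when both
    // cache entries exist, otherwise the diff runs against the full dataset.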
    private void executeDiffOnDataset(final PreviewParameters previewParameters, final OutputStream output) {
        // because of dataset records streaming, the dataset content must be within an auto closeable block
        final DataSetGet dataSetGet = context.getBean(DataSetGet.class, previewParameters.getDataSetId(), false, true);

        boolean identityReleased = false;
        securityProxy.asTechnicalUser();
        try (final InputStream dataSetContent = dataSetGet.execute(); //
                final JsonParser parser = mapper.getFactory().createParser(dataSetContent)) {

            securityProxy.releaseIdentity();
            identityReleased = true;

            final DataSet dataSet = mapper.readerFor(DataSet.class).readValue(parser);
            executePreview( //
                    previewParameters.getNewActions(), //
                    previewParameters.getBaseActions(), //
                    previewParameters.getTdpIds(), //
                    dataSet, //
                    output //
            );
        } catch (IOException e) {
            throw new TDPException(TransformationErrorCodes.UNABLE_TO_PERFORM_PREVIEW, e);
        } finally {
            // make sure the technical identity is released
            if (!identityReleased) {
                securityProxy.releaseIdentity();
            }
        }
    }

    private boolean shouldApplyDiffToSampleSource(final PreviewParameters previewParameters) {
        if (previewParameters.getSourceType() != HEAD && previewParameters.getPreparationId() != null) {
            final TransformationMetadataCacheKey metadataKey = cacheKeyGenerator.generateMetadataKey( //
                    previewParameters.getPreparationId(), //
                    rootStep.id(), //
                    previewParameters.getSourceType() //
            );

            final ContentCacheKey contentKey = cacheKeyGenerator.generateContentKey( //
                    previewParameters.getDataSetId(), //
                    previewParameters.getPreparationId(), //
                    rootStep.id(), //
                    JSON, //
                    previewParameters.getSourceType() //
            );

            return contentCache.has(metadataKey) && contentCache.has(contentKey);
        }
        return false;
    }

    /**
     * Given a list of requested previews, applies the diff to each one. A diff compares two sets of actions and
     * returns information such as the created column ids.
     */
    //@formatter:off
    @RequestMapping(value = "/transform/diff/metadata", method = POST, produces = APPLICATION_JSON_VALUE, consumes = APPLICATION_JSON_VALUE)
    @ApiOperation(value = "Given a list of requested previews, applies the diff to each one. A diff compares two sets of actions and returns information such as the created column ids", notes = "This operation returns the diff metadata", consumes = APPLICATION_JSON_VALUE)
    @VolumeMetered
    public Stream<StepDiff> getCreatedColumns(
            @ApiParam(name = "body", value = "Preview parameters list in json.") @RequestBody final List<PreviewParameters> previewParameters) {
        return previewParameters.stream().map(this::getCreatedColumns);
    }

    @RequestMapping(value = "/preparation/{preparationId}/cache", method = DELETE)
    @ApiOperation(value = "Evict content entries related to the preparation", notes = "This operation removes content entries related to the preparation.")
    @VolumeMetered
    public void evictCache(
            @ApiParam(value = "Preparation Id.") @PathVariable(value = "preparationId") final String preparationId) {
        for (final ExportParameters.SourceType sourceType : ExportParameters.SourceType.values()) {
            evictCache(preparationId, sourceType);
        }
    }

    private void evictCache(final String preparationId, final ExportParameters.SourceType sourceType) {
        final ContentCacheKey metadataKey = cacheKeyGenerator.metadataBuilder() //
                .preparationId(preparationId).sourceType(sourceType).build();
        final ContentCacheKey contentKey = cacheKeyGenerator.contentBuilder() //
                .preparationId(preparationId).sourceType(sourceType).build();
        contentCache.evictMatch(metadataKey);
        contentCache.evictMatch(contentKey);
    }
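    // A minimal sketch of the borrow/release pattern used in executeDiffOnDataset
    // (the boolean flag guards against a double release; names are illustrative):
    //
    //   securityProxy.asTechnicalUser();          // borrow the technical identity
    //   boolean released = false;
    //   try (InputStream in = command.execute()) {
    //       securityProxy.releaseIdentity();      // release as soon as the stream is open
    //       released = true;
    //       // ... consume 'in' on behalf of the calling user ...
    //   } finally {
    //       if (!released) {
    //           securityProxy.releaseIdentity();  // release on early failure too
    //       }
    //   }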
    /**
     * Compare the results of 2 sets of actions, and return the diff metadata, e.g. the created column ids.
     */
    private StepDiff getCreatedColumns(final PreviewParameters previewParameters) {
        final DataSetGetMetadata dataSetGetMetadata = context.getBean(DataSetGetMetadata.class,
                previewParameters.getDataSetId());
        DataSetMetadata dataSetMetadata = dataSetGetMetadata.execute();
        StepDiff stepDiff;
        TransformationContext context = new TransformationContext();
        if (dataSetGetMetadata.isSuccessfulExecution() && dataSetMetadata != null) {
            RowMetadata metadataBase = dataSetMetadata.getRowMetadata();
            RowMetadata metadataAfter = metadataBase.clone();

            applyActionsOnMetadata(context, metadataBase, previewParameters.getBaseActions());
            applyActionsOnMetadata(context, metadataAfter, previewParameters.getNewActions());

            metadataAfter.diff(metadataBase);

            List<String> createdColumnIds = metadataAfter.getColumns().stream() //
                    .filter(c -> Flag.NEW.getValue().equals(c.getDiffFlagValue())) //
                    .map(ColumnMetadata::getId) //
                    .collect(Collectors.toList());

            stepDiff = new StepDiff();
            stepDiff.setCreatedColumns(createdColumnIds);
        } else {
            stepDiff = null; // maybe throw an exception...
        }
        return stepDiff;
    }

    private void applyActionsOnMetadata(TransformationContext context, RowMetadata metadata, String actionsAsJson) {
        List<RunnableAction> actions = actionParser.parse(actionsAsJson);
        ActionContext contextWithMetadata = new ActionContext(context, metadata);
        // compile every action within the context to update the metadata
        for (RunnableAction action : actions) {
            action.getRowAction().compile(contextWithMetadata);
        }
    }

    /**
     * Execute the preview and write the result to the provided output stream.
     *
     * @param actions The actions to execute to diff with reference.
     * @param referenceActions The reference actions.
     * @param indexes The record indexes to diff. If null, it will process all records.
     * @param dataSet The dataset (column metadata and records).
     * @param output The output stream where to write the result.
     */
    private void executePreview(final String actions, final String referenceActions, final String indexes,
            final DataSet dataSet, final OutputStream output) {
        final PreviewConfiguration configuration = PreviewConfiguration.preview() //
                .withActions(actions) //
                .withIndexes(indexes) //
                .fromReference( //
                        Configuration.builder() //
                                .format(JSON) //
                                .output(output) //
                                .actions(referenceActions) //
                                .build() //
                ) //
                .build();
        factory.get(configuration).transform(dataSet, configuration);
    }

    /**
     * Get the action dynamic params.
     */
    //@formatter:off
    @RequestMapping(value = "/transform/suggest/{action}/params", method = POST)
    @ApiOperation(value = "Get the transformation dynamic parameters", notes = "Returns the transformation parameters.")
    @Timed
    public GenericParameter dynamicParams(
            @ApiParam(value = "Action name.") @PathVariable("action") final String action,
            @ApiParam(value = "The column id.") @RequestParam(value = "columnId") final String columnId,
            @ApiParam(value = "Data set content as JSON") final InputStream content) {
        //@formatter:on
        final DynamicType actionType = DynamicType.fromAction(action);
        if (actionType == null) {
            final ExceptionContext exceptionContext = build().put("name", action);
            throw new TDPException(TransformationErrorCodes.UNKNOWN_DYNAMIC_ACTION, exceptionContext);
        }
        try (JsonParser parser = mapper.getFactory().createParser(content)) {
            final DataSet dataSet = mapper.readerFor(DataSet.class).readValue(parser);
            return actionType.getGenerator(context).getParameters(columnId, dataSet);
        } catch (IOException e) {
            throw new TDPException(CommonErrorCodes.UNABLE_TO_PARSE_JSON, e);
        }
    }

    /**
     * Returns all {@link ActionDefinition actions} data prep may apply to a column. Column is optional and only
     * needed to fill out default parameter values.
     *
     * @return A list of {@link ActionDefinition} that can be applied to this column.
     * @see #suggest(ColumnMetadata, int)
     */
    @RequestMapping(value = "/actions/column", method = POST, consumes = APPLICATION_JSON_VALUE, produces = APPLICATION_JSON_VALUE)
    @ApiOperation(value = "Return all actions for a column (regardless of column metadata)", notes = "This operation returns an array of actions.")
    @ResponseBody
    public Stream<ActionDefinition> columnActions(@RequestBody(required = false) ColumnMetadata column) {
        return actionRegistry.findAll() //
                .filter(action -> !"TEST".equals(action.getCategory()) && action.acceptScope(COLUMN)) //
                .map(am -> column != null ? am.adapt(column) : am);
    }
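    // Example (illustrative values): executePreview(newActions, baseActions, "[1,3,5]",
    // dataSet, out) writes a JSON diff limited to the records with those tdp ids,
    // while a null 'indexes' argument diffs every record in the sample.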
    /**
     * Suggest what {@link ActionDefinition actions} can be applied to <code>column</code>.
     *
     * @param column A {@link ColumnMetadata column} definition.
     * @param limit An optional limit parameter to return the first <code>limit</code> suggestions.
     * @return A list of {@link ActionDefinition} that can be applied to this column.
     * @see #suggest(DataSet)
     */
    @RequestMapping(value = "/suggest/column", method = POST, consumes = APPLICATION_JSON_VALUE, produces = APPLICATION_JSON_VALUE)
    @ApiOperation(value = "Suggest actions for a given column metadata", notes = "This operation returns an array of suggested actions in decreasing order of importance.")
    @ResponseBody
    public Stream<ActionDefinition> suggest(@RequestBody(required = false) ColumnMetadata column, //
            @ApiParam(value = "How many actions should be suggested at most", defaultValue = "5") @RequestParam(value = "limit", defaultValue = "5", required = false) int limit) {
        if (column == null) {
            return Stream.empty();
        }
        // look for all actions applicable to the column type
        final Stream<Suggestion> suggestions = suggestionEngine
                .score(actionRegistry.findAll().parallel().filter(am -> am.acceptField(column)), column);
        return suggestions //
                .filter(s -> s.getScore() > 0) // Keep only strictly positive scores (0 or less means not applicable)
                .limit(limit) //
                .map(Suggestion::getAction) // Get the action for positive suggestions
                .map(am -> am.adapt(column)); // Adapt default values (e.g. column name)
    }

    /**
     * Returns all {@link ActionDefinition actions} data prep may apply to a line.
     *
     * @return A list of {@link ActionDefinition} that can be applied to a line.
     */
    @RequestMapping(value = "/actions/line", method = GET, produces = APPLICATION_JSON_VALUE)
    @ApiOperation(value = "Return all actions on lines", notes = "This operation returns an array of actions.")
    @ResponseBody
    public Stream<ActionDefinition> lineActions() {
        return actionRegistry.findAll() //
                .filter(action -> action.acceptScope(LINE)) //
                .map(action -> action.adapt(LINE));
    }

    /**
     * Suggest what {@link ActionDefinition actions} can be applied to <code>dataSet</code>.
     *
     * @param dataSet A {@link DataSet dataset} definition.
     * @return A list of {@link ActionDefinition} that can be applied to this data set (currently always empty).
     * @see #suggest(ColumnMetadata, int)
     */
    @RequestMapping(value = "/suggest/dataset", method = POST, consumes = APPLICATION_JSON_VALUE, produces = APPLICATION_JSON_VALUE)
    @ApiOperation(value = "Suggest actions for a given data set metadata", notes = "This operation returns an array of suggested actions in decreasing order of importance.")
    @ResponseBody
    public List<ActionDefinition> suggest(DataSet dataSet) {
        return Collections.emptyList();
    }
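    // Example (illustrative, actual scores depend on the SuggestionEngine
    // implementation): for a column whose semantic domain is detected as an email
    // address, the engine would typically rank an email-specific action above a
    // generic text action; anything scored 0 or below never reaches the client.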
    /**
     * List all transformation related error codes.
     */
    @RequestMapping(value = "/transform/errors", method = RequestMethod.GET, produces = APPLICATION_JSON_VALUE)
    @ApiOperation(value = "Get all transformation related error codes.", notes = "Returns the list of all transformation related error codes.")
    @Timed
    public Iterable<JsonErrorCodeDescription> listErrors() {
        // need to cast the typed dataset errors into mock ones to use json parsing
        List<JsonErrorCodeDescription> errors = new ArrayList<>(TransformationErrorCodes.values().length);
        for (TransformationErrorCodes code : TransformationErrorCodes.values()) {
            errors.add(new JsonErrorCodeDescription(code));
        }
        return errors;
    }

    /**
     * Get the available export formats.
     */
    @RequestMapping(value = "/export/formats", method = GET)
    @ApiOperation(value = "Get the available format types")
    @Timed
    @PublicAPI
    public Stream<ExportFormatMessage> exportTypes() {
        return formatRegistrationService.getExternalFormats() //
                .sorted(Comparator.comparingInt(ExportFormat::getOrder)) // Enforce strict order.
                .map(f -> beanConversionService.convert(f, ExportFormatMessage.class)) //
                .filter(ExportFormatMessage::isEnabled);
    }

    /**
     * Get the available export formats for the preparation.
     */
    @RequestMapping(value = "/export/formats/preparations/{preparationId}", method = GET)
    @ApiOperation(value = "Get the available format types for the preparation")
    @Timed
    public Stream<ExportFormatMessage> getPreparationExportTypesForPreparation(@PathVariable String preparationId) {
        final Preparation preparation = getPreparation(preparationId);
        final DataSetMetadata metadata = context.getBean(DataSetGetMetadata.class, preparation.getDataSetId())
                .execute();
        return getPreparationExportTypesForDataSet(metadata.getId());
    }
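    // Note: the preparation variant above resolves the preparation's dataset first,
    // then delegates to the dataset variant below, so both endpoints share the same
    // enabled/compatibility filtering of export formats.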
    /**
     * Get the available export formats for the dataset.
     */
    @RequestMapping(value = "/export/formats/datasets/{dataSetId}", method = GET)
    @ApiOperation(value = "Get the available format types for the data set")
    @Timed
    public Stream<ExportFormatMessage> getPreparationExportTypesForDataSet(@PathVariable String dataSetId) {
        final DataSetMetadata metadata = context.getBean(DataSetGetMetadata.class, dataSetId).execute();
        return formatRegistrationService.getExternalFormats() //
                .sorted(Comparator.comparingInt(ExportFormat::getOrder)) // Enforce strict order.
                .filter(ExportFormat::isEnabled) //
                .filter(f -> f.isCompatible(metadata)) //
                .map(f -> beanConversionService.convert(f, ExportFormatMessage.class));
    }

    @RequestMapping(value = "/dictionary", method = GET, produces = APPLICATION_OCTET_STREAM_VALUE)
    @ApiOperation(value = "Get current dictionary (as serialized object).")
    @Timed
    public StreamingResponseBody getDictionary() {
        return outputStream -> {
            final File dictionaryPath = new File(analyzerService.getIndexesLocation() + "/index/dictionary/default/");
            LOG.debug("Returning dictionary at path '{}'", dictionaryPath.getAbsoluteFile());
            final File keywordPath = new File(analyzerService.getIndexesLocation() + "/index/keyword/default/");
            LOG.debug("Returning keywords at path '{}'", keywordPath.getAbsoluteFile());

            // Read lucene directory and build BroadcastIndexObject instance
            final BroadcastIndexObject dictionary, keyword;
            try (FSDirectory directory = FSDirectory.open(dictionaryPath)) {
                dictionary = new BroadcastIndexObject(directory);
            }
            try (FSDirectory directory = FSDirectory.open(keywordPath)) {
                keyword = new BroadcastIndexObject(directory);
            }

            // Serialize it to output
            LOG.debug("Returning dictionaries '{}' / '{}'", dictionary, keyword);
            Dictionaries result = new Dictionaries(dictionary, keyword);
            try (ObjectOutputStream oos = new ObjectOutputStream(new GZIPOutputStream(outputStream))) {
                oos.writeObject(result);
            }
        };
    }
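    // A minimal client-side sketch for consuming /dictionary, mirroring the write
    // path above (assumes the same Dictionaries class is on the client classpath;
    // 'responseStream' is illustrative):
    //
    //   try (ObjectInputStream ois = new ObjectInputStream(new GZIPInputStream(responseStream))) {
    //       Dictionaries dictionaries = (Dictionaries) ois.readObject();
    //   }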
    /**
     * Return the semantic types for a given preparation / column.
     *
     * @param preparationId the preparation id.
     * @param columnId the column id.
     * @param stepId the step id (optional, defaults to 'head').
     * @return the semantic types for a given preparation / column.
     */
    @RequestMapping(value = "/preparations/{preparationId}/columns/{columnId}/types", method = GET, produces = APPLICATION_JSON_VALUE)
    @ApiOperation(value = "list the types of the wanted column", notes = "This list can be used by user to change the column type.")
    @Timed
    @PublicAPI
    public List<SemanticDomain> getPreparationColumnSemanticCategories(
            @ApiParam(value = "The preparation id") @PathVariable String preparationId,
            @ApiParam(value = "The column id") @PathVariable String columnId,
            @ApiParam(value = "The preparation version") @RequestParam(defaultValue = "head") String stepId) {
        LOG.debug("listing preparation semantic categories for preparation #{} column #{}@{}", preparationId,
                columnId, stepId);

        // get the preparation
        final Preparation preparation = getPreparation(preparationId);

        // get the step (in case of 'head', the real step id must be found)
        final String version = StringUtils.equals("head", stepId) ? //
                preparation.getSteps().get(preparation.getSteps().size() - 1).getId() : stepId;

        /*
         * OK, this one is a bit tricky so pay attention.
         *
         * To be able to get the semantic types, the analyzer service needs to run on the result of the preparation.
         *
         * The result must be found in the cache, so if the preparation is not cached, the preparation is run so that
         * it gets cached.
         *
         * Then, the analyzer service just gets the data from the cache. That's it.
         */

        // generate the cache keys for both metadata & content
        final ContentCacheKey metadataKey = cacheKeyGenerator.metadataBuilder() //
                .preparationId(preparationId).stepId(version).sourceType(HEAD).build();
        final ContentCacheKey contentKey = cacheKeyGenerator.contentBuilder() //
                .datasetId(preparation.getDataSetId()).preparationId(preparationId).stepId(version) //
                .format(JSON).sourceType(HEAD) //
                .build();

        // if the preparation is not cached, let's compute it to have some cache
        if (!contentCache.has(metadataKey) || !contentCache.has(contentKey)) {
            addPreparationInCache(preparation, stepId);
        }

        // run the analyzer service on the cached content
        try {
            final DataSetMetadata metadata = mapper.readerFor(DataSetMetadata.class)
                    .readValue(contentCache.get(metadataKey));
            final List<SemanticDomain> semanticDomains = getSemanticDomains(metadata, columnId,
                    contentCache.get(contentKey));
            LOG.debug("found {} for preparation #{}, column #{}", semanticDomains, preparationId, columnId);
            return semanticDomains;
        } catch (IOException e) {
            throw new TDPException(UNEXPECTED_EXCEPTION, e);
        }
    }

    /**
     * Get the preparation from the preparation service.
     *
     * @param preparationId the wanted preparation id.
     * @return the preparation from the preparation service.
     */
    private Preparation getPreparation(@ApiParam(value = "The preparation id") @PathVariable String preparationId) {
        final Preparation preparation;
        try {
            final PreparationDetailsGet details = applicationContext.getBean(PreparationDetailsGet.class,
                    preparationId);
            preparation = mapper.readerFor(Preparation.class).readValue(details.execute());
        } catch (IOException e) {
            throw new TDPException(PREPARATION_DOES_NOT_EXIST, e, build().put("id", preparationId));
        }
        return preparation;
    }

    /**
     * Return the semantic domains for the given parameters.
     *
     * @param metadata the dataset metadata.
     * @param columnId the column id to analyze.
     * @param records the dataset records.
     * @return the semantic domains for the given parameters.
     * @throws IOException if the dataset content cannot be read.
     */
    private List<SemanticDomain> getSemanticDomains(DataSetMetadata metadata, String columnId, InputStream records)
            throws IOException {
        // copy the column metadata and set the semantic domain forced flag to false to make sure the statistics
        // adapter sets all available domains
        final ColumnMetadata columnMetadata = column() //
                .copy(metadata.getRowMetadata().getById(columnId)) //
                .semanticDomainForce(false) //
                .build();

        final Analyzer<Analyzers.Result> analyzer = analyzerService.build(columnMetadata, SEMANTIC);
        analyzer.init();
        try (final JsonParser parser = mapper.getFactory().createParser(records)) {
            final DataSet dataSet = mapper.readerFor(DataSet.class).readValue(parser);
            dataSet.getRecords() //
                    .map(r -> r.get(columnId)) //
                    .forEach(analyzer::analyze);
            analyzer.end();
        }

        final List<Analyzers.Result> analyzerResult = analyzer.getResult();
        statisticsAdapter.adapt(singletonList(columnMetadata), analyzerResult);

        return columnMetadata.getSemanticDomains();
    }
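    // Note: addPreparationInCache (below) relies on a side effect of the export
    // strategy: writing the streamed result to a NullOutputStream discards the
    // payload itself but leaves the metadata and content entries in the cache,
    // which is exactly what getPreparationColumnSemanticCategories needs.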
    /**
     * Add the given preparation to the cache.
     *
     * @param preparation the preparation to cache.
     * @param stepId the preparation step id.
     */
    private void addPreparationInCache(Preparation preparation, String stepId) {
        final ExportParameters exportParameters = new ExportParameters();
        exportParameters.setPreparationId(preparation.getId());
        exportParameters.setExportType("JSON");
        exportParameters.setStepId(stepId);
        exportParameters.setDatasetId(preparation.getDataSetId());

        final StreamingResponseBody streamingResponseBody = executeSampleExportStrategy(exportParameters);
        try {
            // the result is not important here as it will be cached!
            streamingResponseBody.writeTo(new NullOutputStream());
        } catch (IOException e) {
            throw new TDPException(UNEXPECTED_EXCEPTION, e);
        }
    }
}
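// Example aggregation request against /aggregate (hypothetical payload, for
// illustration only; the exact JSON shape is defined by AggregationParameters,
// so the field names below are assumptions, not a verified contract):
//
//   POST /aggregate
//   {
//     "datasetId": "ds-456",
//     "preparationId": "prep-123",
//     "stepId": "head",
//     "operations": [ { "operator": "SUM", "columnId": "0001" } ],
//     "groupBy": [ "0000" ]
//   }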