Java tutorial
/* * Copyright 2009-2015 the CodeLibs Project and the Others. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, * either express or implied. See the License for the specific language * governing permissions and limitations under the License. */ package org.codelibs.fess.transformer; import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.net.URLDecoder; import java.time.LocalDateTime; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import jcifs.smb.ACE; import jcifs.smb.SID; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringUtils; import org.apache.tika.metadata.HttpHeaders; import org.apache.tika.metadata.TikaMetadataKeys; import org.codelibs.core.util.DynamicProperties; import org.codelibs.core.util.StringUtil; import org.codelibs.fess.Constants; import org.codelibs.fess.db.exentity.CrawlingConfig; import org.codelibs.fess.db.exentity.CrawlingConfig.ConfigName; import org.codelibs.fess.helper.CrawlingConfigHelper; import org.codelibs.fess.helper.CrawlingSessionHelper; import org.codelibs.fess.helper.FieldHelper; import org.codelibs.fess.helper.FileTypeHelper; import org.codelibs.fess.helper.LabelTypeHelper; import org.codelibs.fess.helper.PathMappingHelper; import org.codelibs.fess.helper.SambaHelper; import org.codelibs.fess.taglib.FessFunctions; import org.codelibs.fess.util.ComponentUtil; import org.codelibs.robot.RobotCrawlAccessException; import org.codelibs.robot.RobotSystemException; import org.codelibs.robot.client.smb.SmbClient; import org.codelibs.robot.db.cbean.AccessResultDataCB; import org.codelibs.robot.db.exbhv.AccessResultDataBhv; import org.codelibs.robot.entity.AccessResultData; import org.codelibs.robot.entity.ExtractData; import org.codelibs.robot.entity.ResponseData; import org.codelibs.robot.entity.ResultData; import org.codelibs.robot.entity.UrlQueue; import org.codelibs.robot.extractor.Extractor; import org.codelibs.robot.util.CrawlingParameterUtil; import org.codelibs.robot.util.LruHashMap; import org.seasar.framework.container.SingletonS2Container; import org.seasar.framework.util.SerializeUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public abstract class AbstractFessFileTransformer extends AbstractFessXpathTransformer { private static final Logger logger = LoggerFactory // NOPMD .getLogger(AbstractFessFileTransformer.class); public String encoding = null; public String noTitleLabel = "No title."; public int abbreviationMarginLength = 10; public boolean ignoreEmptyContent = false; public int maxTitleLength = 100; public int maxDigestLength = 200; public boolean appendMetaContentToContent = true; public boolean appendBodyContentToContent = true; public boolean enableCache = false; public Map<String, String> parentEncodingMap = Collections .synchronizedMap(new LruHashMap<String, String>(1000)); protected Map<String, String> metaContentMapping; protected abstract Extractor getExtractor(ResponseData responseData); @Override public ResultData transform(final ResponseData responseData) { if (responseData == null || responseData.getResponseBody() == null) { throw new RobotCrawlAccessException("No response body."); } final Extractor extractor = getExtractor(responseData); final InputStream in = responseData.getResponseBody(); final Map<String, String> params = new HashMap<String, String>(); params.put(TikaMetadataKeys.RESOURCE_NAME_KEY, getResourceName(responseData)); final String mimeType = responseData.getMimeType(); params.put(HttpHeaders.CONTENT_TYPE, mimeType); params.put(HttpHeaders.CONTENT_ENCODING, responseData.getCharSet()); final StringBuilder contentMetaBuf = new StringBuilder(1000); final Map<String, Object> dataMap = new HashMap<String, Object>(); final Map<String, Object> metaDataMap = new HashMap<>(); String content; try { final ExtractData extractData = extractor.getText(in, params); content = extractData.getContent(); if (ignoreEmptyContent && StringUtil.isBlank(content)) { return null; } if (logger.isDebugEnabled()) { logger.debug("ExtractData: " + extractData); } // meta for (final String key : extractData.getKeySet()) { final String[] values = extractData.getValues(key); if (values != null) { metaDataMap.put(key, values); if (contentMetaBuf.length() > 0) { contentMetaBuf.append(' '); } final String joinValue = StringUtils.join(values, ' '); if (StringUtil.isNotBlank(joinValue)) { contentMetaBuf.append(joinValue); if (metaContentMapping != null) { final String solrField = metaContentMapping.get(key); if (StringUtil.isNotBlank(solrField)) { if (solrField.endsWith("_m")) { dataMap.put(solrField, values); } else { dataMap.put(solrField, joinValue); } } } } } } } catch (final Exception e) { final RobotCrawlAccessException rcae = new RobotCrawlAccessException( "Could not get a text from " + responseData.getUrl(), e); rcae.setLogLevel(RobotCrawlAccessException.WARN); throw rcae; } finally { IOUtils.closeQuietly(in); } if (content == null) { content = StringUtil.EMPTY; } final String contentMeta = contentMetaBuf.toString(); final ResultData resultData = new ResultData(); resultData.setTransformerName(getName()); final CrawlingSessionHelper crawlingSessionHelper = ComponentUtil.getCrawlingSessionHelper(); final String sessionId = crawlingSessionHelper.getCanonicalSessionId(responseData.getSessionId()); final LocalDateTime documentExpires = crawlingSessionHelper.getDocumentExpires(); final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper(); final SambaHelper sambaHelper = ComponentUtil.getSambaHelper(); final DynamicProperties crawlerProperties = ComponentUtil.getCrawlerProperties(); final boolean useAclAsRole = crawlerProperties.getProperty(Constants.USE_ACL_AS_ROLE, Constants.FALSE) .equals(Constants.TRUE); final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper(); final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId()); final FieldHelper fieldHelper = ComponentUtil.getFieldHelper(); final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper(); String url = responseData.getUrl(); final String indexingTarget = crawlingConfig.getIndexingTarget(url); url = pathMappingHelper.replaceUrl(sessionId, url); final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD); String urlEncoding; final UrlQueue urlQueue = CrawlingParameterUtil.getUrlQueue(); if (urlQueue != null && urlQueue.getEncoding() != null) { urlEncoding = urlQueue.getEncoding(); } else { urlEncoding = responseData.getCharSet(); } // cid final String configId = crawlingConfig.getConfigId(); if (configId != null) { putResultDataBody(dataMap, fieldHelper.configIdField, configId); } // expires if (documentExpires != null) { putResultDataBody(dataMap, fieldHelper.expiresField, FessFunctions.formatDate(documentExpires)); } // segment putResultDataBody(dataMap, fieldHelper.segmentField, sessionId); // content final StringBuilder buf = new StringBuilder(content.length() + 1000); if (appendBodyContentToContent) { buf.append(content); } if (appendMetaContentToContent) { if (buf.length() > 0) { buf.append(' '); } buf.append(contentMeta); } final String body = normalizeContent(buf.toString()); if (StringUtil.isNotBlank(body)) { putResultDataBody(dataMap, fieldHelper.contentField, body); } else { putResultDataBody(dataMap, fieldHelper.contentField, StringUtil.EMPTY); } if (Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fieldHelper.cacheField)) || enableCache) { final String cache = content.trim().replaceAll("[ \\t\\x0B\\f]+", " "); // text cache putResultDataBody(dataMap, fieldHelper.cacheField, cache); putResultDataBody(dataMap, fieldHelper.hasCacheField, Constants.TRUE); } // digest putResultDataBody(dataMap, fieldHelper.digestField, Constants.DIGEST_PREFIX + abbreviate(normalizeContent(content), maxDigestLength)); // title if (!dataMap.containsKey(fieldHelper.titleField)) { if (url.endsWith("/")) { if (StringUtil.isNotBlank(content)) { putResultDataBody(dataMap, fieldHelper.titleField, abbreviate(body, maxTitleLength)); } else { putResultDataBody(dataMap, fieldHelper.titleField, noTitleLabel); } } else { final String u = decodeUrlAsName(url, url.startsWith("file:")); final int pos = u.lastIndexOf('/'); if (pos == -1) { putResultDataBody(dataMap, fieldHelper.titleField, u); } else { putResultDataBody(dataMap, fieldHelper.titleField, u.substring(pos + 1)); } } } // host putResultDataBody(dataMap, fieldHelper.hostField, getHost(url)); // site putResultDataBody(dataMap, fieldHelper.siteField, getSite(url, urlEncoding)); // url putResultDataBody(dataMap, fieldHelper.urlField, url); // created putResultDataBody(dataMap, fieldHelper.createdField, Constants.NOW); // TODO anchor putResultDataBody(dataMap, fieldHelper.anchorField, StringUtil.EMPTY); // mimetype putResultDataBody(dataMap, fieldHelper.mimetypeField, mimeType); if (fileTypeHelper != null) { // filetype putResultDataBody(dataMap, fieldHelper.filetypeField, fileTypeHelper.get(mimeType)); } // contentLength putResultDataBody(dataMap, fieldHelper.contentLengthField, Long.toString(responseData.getContentLength())); // lastModified if (responseData.getLastModified() != null) { putResultDataBody(dataMap, fieldHelper.lastModifiedField, FessFunctions.formatDate(responseData.getLastModified())); } // indexingTarget putResultDataBody(dataMap, Constants.INDEXING_TARGET, indexingTarget); // boost putResultDataBody(dataMap, fieldHelper.boostField, crawlingConfig.getDocumentBoost()); // label: labelType final Set<String> labelTypeSet = new HashSet<String>(); for (final String labelType : crawlingConfig.getLabelTypeValues()) { labelTypeSet.add(labelType); } final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper(); labelTypeSet.addAll(labelTypeHelper.getMatchedLabelValueSet(url)); putResultDataBody(dataMap, fieldHelper.labelField, labelTypeSet); // role: roleType final List<String> roleTypeList = new ArrayList<String>(); for (final String roleType : crawlingConfig.getRoleTypeValues()) { roleTypeList.add(roleType); } if (useAclAsRole && responseData.getUrl().startsWith("smb://")) { final ACE[] aces = (ACE[]) responseData.getMetaDataMap().get(SmbClient.SMB_ACCESS_CONTROL_ENTRIES); if (aces != null) { for (final ACE item : aces) { final SID sid = item.getSID(); roleTypeList.add(sambaHelper.getAccountId(sid)); } if (logger.isDebugEnabled()) { logger.debug("smbUrl:" + responseData.getUrl() + " roleType:" + roleTypeList.toString()); } } } putResultDataBody(dataMap, fieldHelper.roleField, roleTypeList); // TODO date // TODO lang // id putResultDataBody(dataMap, fieldHelper.idField, crawlingSessionHelper.generateId(dataMap)); // parentId String parentUrl = responseData.getParentUrl(); if (StringUtil.isNotBlank(parentUrl)) { parentUrl = pathMappingHelper.replaceUrl(sessionId, parentUrl); putResultDataBody(dataMap, fieldHelper.urlField, parentUrl); putResultDataBody(dataMap, fieldHelper.parentIdField, crawlingSessionHelper.generateId(dataMap)); putResultDataBody(dataMap, fieldHelper.urlField, url); // set again } // from config final Map<String, String> scriptConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.SCRIPT); final Map<String, String> metaConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.META); for (final Map.Entry<String, String> entry : metaConfigMap.entrySet()) { final String key = entry.getKey(); final String[] values = entry.getValue().split(","); for (final String value : values) { putResultDataWithTemplate(dataMap, key, metaDataMap.get(value), scriptConfigMap.get(key)); } } final Map<String, String> valueConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.VALUE); for (final Map.Entry<String, String> entry : valueConfigMap.entrySet()) { final String key = entry.getKey(); putResultDataWithTemplate(dataMap, key, entry.getValue(), scriptConfigMap.get(key)); } try { resultData.setData(SerializeUtil.fromObjectToBinary(dataMap)); } catch (final Exception e) { throw new RobotCrawlAccessException("Could not serialize object: " + url, e); } resultData.setEncoding(charsetName); return resultData; } protected String abbreviate(final String str, final int maxWidth) { String newStr = StringUtils.abbreviate(str, maxWidth); try { if (newStr.getBytes(Constants.UTF_8).length > maxWidth + abbreviationMarginLength) { newStr = StringUtils.abbreviate(str, maxWidth / 2); } } catch (final UnsupportedEncodingException e) { // NOP } return newStr; } private String getResourceName(final ResponseData responseData) { String name = responseData.getUrl(); final String enc = responseData.getCharSet(); if (name == null || enc == null) { return null; } name = name.replaceAll("/+$", StringUtil.EMPTY); final int idx = name.lastIndexOf('/'); if (idx >= 0) { name = name.substring(idx + 1); } try { return URLDecoder.decode(name, enc); } catch (final Exception e) { return name; } } protected String decodeUrlAsName(final String url, final boolean escapePlus) { if (url == null) { return null; } String enc = Constants.UTF_8; if (encoding == null) { final UrlQueue urlQueue = CrawlingParameterUtil.getUrlQueue(); if (urlQueue != null) { final String parentUrl = urlQueue.getParentUrl(); if (StringUtil.isNotEmpty(parentUrl)) { final String sessionId = urlQueue.getSessionId(); final String pageEnc = getParentEncoding(parentUrl, sessionId); if (pageEnc != null) { enc = pageEnc; } else if (urlQueue.getEncoding() != null) { enc = urlQueue.getEncoding(); } } } } else { enc = encoding; } final String escapedUrl = escapePlus ? url.replace("+", "%2B") : url; try { return URLDecoder.decode(escapedUrl, enc); } catch (final Exception e) { return url; } } protected String getParentEncoding(final String parentUrl, final String sessionId) { final String key = sessionId + ":" + parentUrl; String enc = parentEncodingMap.get(key); if (enc != null) { return enc; } final AccessResultDataCB cb = new AccessResultDataCB(); cb.query().queryAccessResult().setSessionId_Equal(sessionId); cb.query().queryAccessResult().setUrl_Equal(parentUrl); cb.specify().columnEncoding(); final AccessResultData accessResultData = SingletonS2Container.getComponent(AccessResultDataBhv.class) .selectEntity(cb); if (accessResultData != null && accessResultData.getEncoding() != null) { enc = accessResultData.getEncoding(); parentEncodingMap.put(key, enc); return enc; } return null; } @Override protected String getHost(final String url) { if (StringUtil.isBlank(url)) { return StringUtil.EMPTY; // empty } if (url.startsWith("file:////")) { final String value = decodeUrlAsName(url.substring(9), true); final int pos = value.indexOf('/'); if (pos > 0) { return value.substring(0, pos); } else if (pos == -1) { return value; } else { return "localhost"; } } else if (url.startsWith("file:")) { return "localhost"; } return super.getHost(url); } @Override protected String getSite(final String url, final String encoding) { if (StringUtil.isBlank(url)) { return StringUtil.EMPTY; // empty } if (url.startsWith("file:////")) { final String value = decodeUrlAsName(url.substring(9), true); return StringUtils.abbreviate("\\\\" + value.replace('/', '\\'), maxSiteLength); } else if (url.startsWith("file:")) { final String value = decodeUrlAsName(url.substring(5), true); if (value.length() > 2 && value.charAt(2) == ':') { // Windows return StringUtils.abbreviate(value.substring(1).replace('/', '\\'), maxSiteLength); } else { // Unix return StringUtils.abbreviate(value, maxSiteLength); } } return super.getSite(url, encoding); } @Override public Object getData(final AccessResultData accessResultData) { final byte[] data = accessResultData.getData(); if (data != null) { try { return SerializeUtil.fromBinaryToObject(data); } catch (final Exception e) { throw new RobotSystemException("Could not create an instanced from bytes.", e); } } return new HashMap<String, Object>(); } public void addMetaContentMapping(final String metaname, final String solrField) { if (metaContentMapping == null) { metaContentMapping = new HashMap<String, String>(); } metaContentMapping.put(metaname, solrField); } }