/* * Copyright (c) 2006-2013 Nuxeo SA (http://nuxeo.com/) and others. * * All rights reserved. This program and the accompanying materials * are made available under the terms of the Eclipse Public License v1.0 * which accompanies this distribution, and is available at * http://www.eclipse.org/legal/epl-v10.html * * Contributors: * Florent Guillaume * Stephane Lacoin */ package org.nuxeo.ecm.core.storage; import java.util.LinkedList; import java.util.List; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.nuxeo.ecm.core.api.Blob; import org.nuxeo.ecm.core.api.ClientException; import org.nuxeo.ecm.core.api.DocumentModel; import org.nuxeo.ecm.core.api.IdRef; import org.nuxeo.ecm.core.api.blobholder.BlobHolder; import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder; import org.nuxeo.ecm.core.convert.api.ConversionException; import org.nuxeo.ecm.core.convert.api.ConversionService; import org.nuxeo.ecm.core.storage.FulltextUpdaterWork.IndexAndText; import org.nuxeo.ecm.core.utils.BlobsExtractor; import org.nuxeo.ecm.core.work.AbstractWork; import org.nuxeo.ecm.core.work.api.Work; import org.nuxeo.ecm.core.work.api.WorkManager; import org.nuxeo.runtime.api.Framework; /** * Work task that does fulltext extraction from the blobs of the given document. * <p> * The extracted fulltext is then passed to the single-threaded * {@link FulltextUpdaterWork}. * <p> * This base abstract class must be subclassed in order to implement the proper * {@link #initFulltextConfigurationAndParser} depending on the storage. * * @since 5.7 */ public abstract class FulltextExtractorWork extends AbstractWork { private static final long serialVersionUID = 1L; private static final Log log = LogFactory.getLog(FulltextExtractorWork.class); protected static final String ANY2TEXT = "any2text"; protected static final String CATEGORY = "fulltextExtractor"; protected static final String TITLE = "fulltextExtractor"; protected final boolean excludeProxies; protected transient FulltextConfiguration fulltextConfiguration; protected transient FulltextParser fulltextParser; public FulltextExtractorWork(String repositoryName, String docId, String id, boolean excludeProxies) { super(id); setDocument(repositoryName, docId); this.excludeProxies = excludeProxies; } @Override public String getCategory() { return CATEGORY; } @Override public String getTitle() { return TITLE; } @Override public void work() throws Exception { initSession(); // if the runtime has shutdown (normally because tests are finished) // this can happen, see NXP-4009 if (session.getPrincipal() == null) { return; } initFulltextConfigurationAndParser(); setStatus("Extracting"); setProgress(Progress.PROGRESS_0_PC); extractBinaryText(); setProgress(Progress.PROGRESS_100_PC); setStatus("Done"); } /** * Initializes the fulltext configuration and parser. * * @since 5.9.5 */ public abstract void initFulltextConfigurationAndParser(); protected void extractBinaryText() throws ClientException { IdRef docRef = new IdRef(docId); if (!session.exists(docRef)) { // doc is gone return; } DocumentModel doc = session.getDocument(docRef); if (excludeProxies && doc.isProxy()) { // VCS proxies don't have any fulltext attached, it's // the target document that carries it return; } if (!fulltextConfiguration.isFulltextIndexable(doc.getType())) { // excluded by config return; } // Iterate on each index to set the binaryText column BlobsExtractor extractor = new BlobsExtractor(); List<IndexAndText> indexesAndText = new LinkedList<IndexAndText>(); for (String indexName : fulltextConfiguration.indexNames) { if (!fulltextConfiguration.indexesAllBinary.contains(indexName) && fulltextConfiguration.propPathsByIndexBinary.get(indexName) == null) { // nothing to do: index not configured for blob continue; } extractor.setExtractorProperties( fulltextConfiguration.propPathsByIndexBinary.get(indexName), fulltextConfiguration.propPathsExcludedByIndexBinary.get(indexName), fulltextConfiguration.indexesAllBinary.contains(indexName)); List<Blob> blobs = extractor.getBlobs(doc); String text = blobsToText(blobs, docId); text = fulltextParser.parse(text, null); indexesAndText.add(new IndexAndText(indexName, text)); } if (!indexesAndText.isEmpty()) { Work work = new FulltextUpdaterWork(repositoryName, docId, false, true, indexesAndText); WorkManager workManager = Framework.getLocalService(WorkManager.class); workManager.schedule(work, true); } } @Override public void cleanUp(boolean ok, Exception e) { super.cleanUp(ok, e); fulltextConfiguration = null; fulltextParser = null; } protected String blobsToText(List<Blob> blobs, String docId) { List<String> strings = new LinkedList<String>(); for (Blob blob : blobs) { try { SimpleBlobHolder bh = new SimpleBlobHolder(blob); BlobHolder result = convert(bh); if (result == null) { continue; } blob = result.getBlob(); if (blob == null) { continue; } String string = new String(blob.getByteArray(), "UTF-8"); // strip '\0 chars from text if (string.indexOf('\0') >= 0) { string = string.replace("\0", " "); } strings.add(string); } catch (Exception e) { String msg = "Could not extract fulltext of file '" + blob.getFilename() + "' for document: " + docId + ": " + e; log.warn(msg); log.debug(msg, e); continue; } } return StringUtils.join(strings, " "); } protected BlobHolder convert(BlobHolder blobHolder) throws ConversionException { ConversionService conversionService = Framework.getLocalService(ConversionService.class); if (conversionService == null) { log.debug("No ConversionService available"); return null; } return conversionService.convert(ANY2TEXT, blobHolder, null); } }