From 76797f3c8aa7c35a431252446d5d2bea143ced05 Mon Sep 17 00:00:00 2001 From: millmanorama Date: Thu, 12 Jan 2017 15:58:35 +0100 Subject: [PATCH 1/8] move Chunker to its own file; replace BufferedReader with PushbackReader --- .../sleuthkit/autopsy/casemodule/Case.java | 8 +- .../CoordinationService.java | 255 +++++++++--------- .../CoordinationServiceNamespace.java | 4 +- .../autoingest/AutoIngestJobLogger.java | 2 +- .../autoingest/AutoIngestManager.java | 2 +- .../configuration/SharedConfiguration.java | 4 +- .../autopsy/keywordsearch/Chunker.java | 244 +++++++++++++++++ .../autopsy/keywordsearch/Ingester.java | 213 --------------- 8 files changed, 385 insertions(+), 347 deletions(-) create mode 100644 KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java diff --git a/Core/src/org/sleuthkit/autopsy/casemodule/Case.java b/Core/src/org/sleuthkit/autopsy/casemodule/Case.java index d681e81b48..3beac08648 100644 --- a/Core/src/org/sleuthkit/autopsy/casemodule/Case.java +++ b/Core/src/org/sleuthkit/autopsy/casemodule/Case.java @@ -1008,7 +1008,7 @@ public class Case implements SleuthkitCase.ErrorObserver { // The shared lock needs to be created on a special thread so it can be released // from the same thread. Future future = getCurrentCaseExecutor().submit(() -> { - currentCaseLock = CoordinationService.getInstance(CoordinationServiceNamespace.getRoot()).tryGetSharedLock(CoordinationService.CategoryNode.CASES, caseDir); + currentCaseLock = CoordinationService.getServiceForNamespace(CoordinationServiceNamespace.getRoot()).tryGetSharedLock(CoordinationService.CategoryNode.CASES, caseDir); if (null == currentCaseLock) { throw new CaseActionException(NbBundle.getMessage(Case.class, "Case.exception.errorLocking", CaseMetadata.getFileExtension())); } @@ -1019,7 +1019,7 @@ public class Case implements SleuthkitCase.ErrorObserver { // The exclusive lock uses the unique case name. // This lock does not need to be on a special thread since it will be released before // leaving this method - exclusiveResourceLock = CoordinationService.getInstance(CoordinationServiceNamespace.getRoot()).tryGetExclusiveLock(CoordinationService.CategoryNode.RESOURCE, + exclusiveResourceLock = CoordinationService.getServiceForNamespace(CoordinationServiceNamespace.getRoot()).tryGetExclusiveLock(CoordinationService.CategoryNode.RESOURCE, dbName, 12, TimeUnit.HOURS); if (null == exclusiveResourceLock) { throw new CaseActionException(NbBundle.getMessage(Case.class, "Case.exception.errorLocking", CaseMetadata.getFileExtension())); @@ -1269,7 +1269,7 @@ public class Case implements SleuthkitCase.ErrorObserver { // The shared lock needs to be created on a special thread so it can be released // from the same thread. 
Future future = getCurrentCaseExecutor().submit(() -> { - currentCaseLock = CoordinationService.getInstance(CoordinationServiceNamespace.getRoot()).tryGetSharedLock(CoordinationService.CategoryNode.CASES, metadata.getCaseDirectory()); + currentCaseLock = CoordinationService.getServiceForNamespace(CoordinationServiceNamespace.getRoot()).tryGetSharedLock(CoordinationService.CategoryNode.CASES, metadata.getCaseDirectory()); if (null == currentCaseLock) { throw new CaseActionException(NbBundle.getMessage(Case.class, "Case.exception.errorLocking", CaseMetadata.getFileExtension())); } @@ -1280,7 +1280,7 @@ public class Case implements SleuthkitCase.ErrorObserver { // The exclusive lock uses the unique case name // This lock does not need to be on a special thread since it will be released before // leaving this method - exclusiveResourceLock = CoordinationService.getInstance(CoordinationServiceNamespace.getRoot()).tryGetExclusiveLock(CoordinationService.CategoryNode.RESOURCE, + exclusiveResourceLock = CoordinationService.getServiceForNamespace(CoordinationServiceNamespace.getRoot()).tryGetExclusiveLock(CoordinationService.CategoryNode.RESOURCE, metadata.getCaseDatabaseName(), 12, TimeUnit.HOURS); if (null == exclusiveResourceLock) { throw new CaseActionException(NbBundle.getMessage(Case.class, "Case.exception.errorLocking", CaseMetadata.getFileExtension())); diff --git a/Core/src/org/sleuthkit/autopsy/coordinationservice/CoordinationService.java b/Core/src/org/sleuthkit/autopsy/coordinationservice/CoordinationService.java index 39c5c25ba6..adc89d4afa 100644 --- a/Core/src/org/sleuthkit/autopsy/coordinationservice/CoordinationService.java +++ b/Core/src/org/sleuthkit/autopsy/coordinationservice/CoordinationService.java @@ -1,7 +1,7 @@ /* * Autopsy Forensic Browser * - * Copyright 2015 Basis Technology Corp. + * Copyright 2011-2017 Basis Technology Corp. * Contact: carrier sleuthkit org * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -18,129 +18,89 @@ */ package org.sleuthkit.autopsy.coordinationservice; +import java.io.IOException; import java.util.HashMap; import java.util.Map; import java.util.concurrent.TimeUnit; -import org.apache.zookeeper.CreateMode; -import org.apache.zookeeper.ZooDefs; import org.apache.curator.RetryPolicy; -import org.apache.curator.retry.ExponentialBackoffRetry; import org.apache.curator.framework.CuratorFramework; import org.apache.curator.framework.CuratorFrameworkFactory; import org.apache.curator.framework.recipes.locks.InterProcessMutex; import org.apache.curator.framework.recipes.locks.InterProcessReadWriteLock; +import org.apache.curator.retry.ExponentialBackoffRetry; +import org.apache.zookeeper.CreateMode; import org.apache.zookeeper.KeeperException; -import org.sleuthkit.autopsy.core.UserPreferences; -import java.io.IOException; -import org.apache.zookeeper.WatchedEvent; -import org.apache.zookeeper.ZooKeeper; import org.apache.zookeeper.KeeperException.NoNodeException; +import org.apache.zookeeper.WatchedEvent; +import org.apache.zookeeper.ZooDefs; +import org.apache.zookeeper.ZooKeeper; +import org.sleuthkit.autopsy.core.UserPreferences; /** - * A centralized service for maintaining configuration information and providing - * distributed synchronization using a shared hierarchical namespace of nodes. + * A coordination service for maintaining configuration information and + * providing distributed synchronization using a shared hierarchical namespace + * of nodes. + * + * TODO (JIRA 2205): Simple refactoring for general use. 
*/ public final class CoordinationService { - /** - * Category nodes are the immediate children of the root node of a shared - * hierarchical namespace managed by the coordination service. - */ - public enum CategoryNode { // RJCTODO: Move this to CoordinationServiceNamespace - - CASES("cases"), - MANIFESTS("manifests"), - CONFIG("config"), - RESOURCE("resource"); - - private final String displayName; - - private CategoryNode(String displayName) { - this.displayName = displayName; - } - - public String getDisplayName() { - return displayName; - } - } - - /** - * Exception type thrown by the coordination service. - */ - public final static class CoordinationServiceException extends Exception { - - private static final long serialVersionUID = 1L; - - private CoordinationServiceException(String message) { - super(message); - } - - private CoordinationServiceException(String message, Throwable cause) { - super(message, cause); - } - } - - /** - * An opaque encapsulation of a lock for use in distributed synchronization. - * Instances are obtained by calling a get lock method and must be passed to - * a release lock method. - */ - public static class Lock implements AutoCloseable { - - /** - * This implementation uses the Curator read/write lock. see - * http://curator.apache.org/curator-recipes/shared-reentrant-read-write-lock.html - */ - private final InterProcessMutex interProcessLock; - private final String nodePath; - - private Lock(String nodePath, InterProcessMutex lock) { - this.nodePath = nodePath; - this.interProcessLock = lock; - } - - public String getNodePath() { - return nodePath; - } - - public void release() throws CoordinationServiceException { - try { - this.interProcessLock.release(); - } catch (Exception ex) { - throw new CoordinationServiceException(String.format("Failed to release the lock on %s", nodePath), ex); - } - } - - @Override - public void close() throws CoordinationServiceException { - release(); - } - } - private static CuratorFramework curator = null; private static final Map rootNodesToServices = new HashMap<>(); - private final Map categoryNodeToPath = new HashMap<>(); private static final int SESSION_TIMEOUT_MILLISECONDS = 300000; private static final int CONNECTION_TIMEOUT_MILLISECONDS = 300000; private static final int ZOOKEEPER_SESSION_TIMEOUT_MILLIS = 3000; private static final int ZOOKEEPER_CONNECTION_TIMEOUT_MILLIS = 15000; - private static final int PORT_OFFSET = 1000; + private static final int PORT_OFFSET = 1000; // When run in Solr, ZooKeeper defaults to Solr port + 1000 + private final Map categoryNodeToPath = new HashMap<>(); /** - * Gets an instance of the centralized coordination service for a specific - * namespace. + * Determines if ZooKeeper is accessible with the current settings. Closes + * the connection prior to returning. 
+ * + * @return true if a connection was achieved, false otherwise + * + * @throws InterruptedException + * @throws IOException + */ + private static boolean isZooKeeperAccessible() throws InterruptedException, IOException { + boolean result = false; + Object workerThreadWaitNotifyLock = new Object(); + int zooKeeperServerPort = Integer.valueOf(UserPreferences.getIndexingServerPort()) + PORT_OFFSET; + String connectString = UserPreferences.getIndexingServerHost() + ":" + zooKeeperServerPort; + ZooKeeper zooKeeper = new ZooKeeper(connectString, ZOOKEEPER_SESSION_TIMEOUT_MILLIS, + (WatchedEvent event) -> { + synchronized (workerThreadWaitNotifyLock) { + workerThreadWaitNotifyLock.notify(); + } + }); + synchronized (workerThreadWaitNotifyLock) { + workerThreadWaitNotifyLock.wait(ZOOKEEPER_CONNECTION_TIMEOUT_MILLIS); + } + ZooKeeper.States state = zooKeeper.getState(); + if (state == ZooKeeper.States.CONNECTED || state == ZooKeeper.States.CONNECTEDREADONLY) { + result = true; + } + zooKeeper.close(); + return result; + } + + /** + * Gets a coordination service for a specific namespace. * * @param rootNode The name of the root node that defines the namespace. * - * @return The service for the namespace defined by the root node name. + * @return The coordination service. * - * @throws CoordinationServiceException If an instaNce of the coordination + * @throws CoordinationServiceException If an instance of the coordination * service cannot be created. */ - public static synchronized CoordinationService getInstance(String rootNode) throws CoordinationServiceException { + public static synchronized CoordinationService getServiceForNamespace(String rootNode) throws CoordinationServiceException { + /* + * Connect to ZooKeeper via Curator. + */ if (null == curator) { RetryPolicy retryPolicy = new ExponentialBackoffRetry(1000, 3); - // When run in Solr, ZooKeeper defaults to Solr port + 1000 int zooKeeperServerPort = Integer.valueOf(UserPreferences.getIndexingServerPort()) + PORT_OFFSET; String connectString = UserPreferences.getIndexingServerHost() + ":" + zooKeeperServerPort; curator = CuratorFrameworkFactory.newClient(connectString, SESSION_TIMEOUT_MILLISECONDS, CONNECTION_TIMEOUT_MILLISECONDS, retryPolicy); @@ -157,7 +117,7 @@ public final class CoordinationService { CoordinationService service; try { service = new CoordinationService(rootNode); - } catch (Exception ex) { + } catch (IOException | InterruptedException | KeeperException | CoordinationServiceException ex) { curator = null; throw new CoordinationServiceException("Failed to create coordination service", ex); } @@ -167,15 +127,18 @@ public final class CoordinationService { } /** - * Constructs an instance of the centralized coordination service for a - * specific namespace. + * Constructs an instance of the coordination service for a specific + * namespace. * * @param rootNodeName The name of the root node that defines the namespace. 
+ * + * @throws Exception (calls Curator methods that throw Exception instead of + * more specific exceptions) */ - private CoordinationService(String rootNodeName) throws Exception { + private CoordinationService(String rootNodeName) throws InterruptedException, IOException, KeeperException, CoordinationServiceException { if (false == isZooKeeperAccessible()) { - throw new Exception("Unable to access ZooKeeper"); + throw new CoordinationServiceException("Unable to access ZooKeeper"); } String rootNode = rootNodeName; @@ -191,6 +154,8 @@ public final class CoordinationService { if (ex.code() != KeeperException.Code.NODEEXISTS) { throw ex; } + } catch (Exception ex) { + throw new CoordinationServiceException("Curator experienced an error", ex); } categoryNodeToPath.put(node.getDisplayName(), nodePath); } @@ -385,35 +350,77 @@ public final class CoordinationService { } /** - * Determines if ZooKeeper is accessible with the current settings. Closes - * the connection prior to returning. - * - * @return true if a connection was achieved, false otherwise + * Exception type thrown by the coordination service. */ - private static boolean isZooKeeperAccessible() { - boolean result = false; - Object workerThreadWaitNotifyLock = new Object(); - int zooKeeperServerPort = Integer.valueOf(UserPreferences.getIndexingServerPort()) + PORT_OFFSET; - String connectString = UserPreferences.getIndexingServerHost() + ":" + zooKeeperServerPort; + public final static class CoordinationServiceException extends Exception { - try { - ZooKeeper zooKeeper = new ZooKeeper(connectString, ZOOKEEPER_SESSION_TIMEOUT_MILLIS, - (WatchedEvent event) -> { + private static final long serialVersionUID = 1L; - synchronized (workerThreadWaitNotifyLock) { - workerThreadWaitNotifyLock.notify(); - } - }); - synchronized (workerThreadWaitNotifyLock) { - workerThreadWaitNotifyLock.wait(ZOOKEEPER_CONNECTION_TIMEOUT_MILLIS); - } - ZooKeeper.States state = zooKeeper.getState(); - if (state == ZooKeeper.States.CONNECTED || state == ZooKeeper.States.CONNECTEDREADONLY) { - result = true; - } - zooKeeper.close(); - } catch (InterruptedException | IOException ignored) { + private CoordinationServiceException(String message) { + super(message); + } + + private CoordinationServiceException(String message, Throwable cause) { + super(message, cause); + } + } + + /** + * An opaque encapsulation of a lock for use in distributed synchronization. + * Instances are obtained by calling a get lock method and must be passed to + * a release lock method. + */ + public static class Lock implements AutoCloseable { + + /** + * This implementation uses the Curator read/write lock. see + * http://curator.apache.org/curator-recipes/shared-reentrant-read-write-lock.html + */ + private final InterProcessMutex interProcessLock; + private final String nodePath; + + private Lock(String nodePath, InterProcessMutex lock) { + this.nodePath = nodePath; + this.interProcessLock = lock; + } + + public String getNodePath() { + return nodePath; + } + + public void release() throws CoordinationServiceException { + try { + this.interProcessLock.release(); + } catch (Exception ex) { + throw new CoordinationServiceException(String.format("Failed to release the lock on %s", nodePath), ex); + } + } + + @Override + public void close() throws CoordinationServiceException { + release(); + } + } + + /** + * Category nodes are the immediate children of the root node of a shared + * hierarchical namespace managed by a coordination service. 
+ */ + public enum CategoryNode { + + CASES("cases"), + MANIFESTS("manifests"), + CONFIG("config"), + RESOURCE("resource"); + + private final String displayName; + + private CategoryNode(String displayName) { + this.displayName = displayName; + } + + public String getDisplayName() { + return displayName; } - return result; } } diff --git a/Core/src/org/sleuthkit/autopsy/coordinationservice/CoordinationServiceNamespace.java b/Core/src/org/sleuthkit/autopsy/coordinationservice/CoordinationServiceNamespace.java index e1f2a3df42..567dd38bc6 100644 --- a/Core/src/org/sleuthkit/autopsy/coordinationservice/CoordinationServiceNamespace.java +++ b/Core/src/org/sleuthkit/autopsy/coordinationservice/CoordinationServiceNamespace.java @@ -1,7 +1,7 @@ /* * Autopsy Forensic Browser * - * Copyright 2015 Basis Technology Corp. + * Copyright 2016-2017 Basis Technology Corp. * Contact: carrier sleuthkit org * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -19,7 +19,7 @@ package org.sleuthkit.autopsy.coordinationservice; /** - * Namespace elements for auto ingest coordination service nodes. + * Root node for Autopsy coordination service namespace. */ public final class CoordinationServiceNamespace { private static final String ROOT = "autopsy"; diff --git a/Experimental/src/org/sleuthkit/autopsy/experimental/autoingest/AutoIngestJobLogger.java b/Experimental/src/org/sleuthkit/autopsy/experimental/autoingest/AutoIngestJobLogger.java index 3165ad23c9..e9406950fd 100644 --- a/Experimental/src/org/sleuthkit/autopsy/experimental/autoingest/AutoIngestJobLogger.java +++ b/Experimental/src/org/sleuthkit/autopsy/experimental/autoingest/AutoIngestJobLogger.java @@ -431,7 +431,7 @@ final class AutoIngestJobLogger { * log file. */ private void log(MessageCategory category, String message) throws AutoIngestJobLoggerException, InterruptedException { - try (Lock lock = CoordinationService.getInstance(CoordinationServiceNamespace.getRoot()).tryGetExclusiveLock(CoordinationService.CategoryNode.CASES, getLogPath(caseDirectoryPath).toString(), LOCK_TIME_OUT, LOCK_TIME_OUT_UNIT)) { + try (Lock lock = CoordinationService.getServiceForNamespace(CoordinationServiceNamespace.getRoot()).tryGetExclusiveLock(CoordinationService.CategoryNode.CASES, getLogPath(caseDirectoryPath).toString(), LOCK_TIME_OUT, LOCK_TIME_OUT_UNIT)) { if (null != lock) { File logFile = getLogPath(caseDirectoryPath).toFile(); try (PrintWriter writer = new PrintWriter(new BufferedWriter(new FileWriter(logFile, logFile.exists())), true)) { diff --git a/Experimental/src/org/sleuthkit/autopsy/experimental/autoingest/AutoIngestManager.java b/Experimental/src/org/sleuthkit/autopsy/experimental/autoingest/AutoIngestManager.java index 867ae631e8..bb55281227 100644 --- a/Experimental/src/org/sleuthkit/autopsy/experimental/autoingest/AutoIngestManager.java +++ b/Experimental/src/org/sleuthkit/autopsy/experimental/autoingest/AutoIngestManager.java @@ -232,7 +232,7 @@ public final class AutoIngestManager extends Observable implements PropertyChang void startUp() throws AutoIngestManagerStartupException { SYS_LOGGER.log(Level.INFO, "Auto ingest starting"); try { - coordinationService = CoordinationService.getInstance(CoordinationServiceNamespace.getRoot()); + coordinationService = CoordinationService.getServiceForNamespace(CoordinationServiceNamespace.getRoot()); } catch (CoordinationServiceException ex) { throw new AutoIngestManagerStartupException("Failed to get coordination service", ex); } diff --git 
a/Experimental/src/org/sleuthkit/autopsy/experimental/configuration/SharedConfiguration.java b/Experimental/src/org/sleuthkit/autopsy/experimental/configuration/SharedConfiguration.java index 7599ed4648..653fc65807 100644 --- a/Experimental/src/org/sleuthkit/autopsy/experimental/configuration/SharedConfiguration.java +++ b/Experimental/src/org/sleuthkit/autopsy/experimental/configuration/SharedConfiguration.java @@ -160,7 +160,7 @@ public class SharedConfiguration { File remoteFolder = getSharedFolder(); - try (Lock writeLock = CoordinationService.getInstance(LOCK_ROOT).tryGetExclusiveLock(CoordinationService.CategoryNode.CONFIG, remoteFolder.getAbsolutePath(), 30, TimeUnit.MINUTES)) { + try (Lock writeLock = CoordinationService.getServiceForNamespace(LOCK_ROOT).tryGetExclusiveLock(CoordinationService.CategoryNode.CONFIG, remoteFolder.getAbsolutePath(), 30, TimeUnit.MINUTES)) { if (writeLock == null) { logger.log(Level.INFO, String.format("Failed to lock %s - another node is currently uploading or downloading configuration", remoteFolder.getAbsolutePath())); return SharedConfigResult.LOCKED; @@ -230,7 +230,7 @@ public class SharedConfiguration { File remoteFolder = getSharedFolder(); - try (Lock readLock = CoordinationService.getInstance(LOCK_ROOT).tryGetSharedLock(CoordinationService.CategoryNode.CONFIG, remoteFolder.getAbsolutePath(), 30, TimeUnit.MINUTES)) { + try (Lock readLock = CoordinationService.getServiceForNamespace(LOCK_ROOT).tryGetSharedLock(CoordinationService.CategoryNode.CONFIG, remoteFolder.getAbsolutePath(), 30, TimeUnit.MINUTES)) { if (readLock == null) { return SharedConfigResult.LOCKED; } diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java new file mode 100644 index 0000000000..ffe0ed4b0b --- /dev/null +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java @@ -0,0 +1,244 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ +package org.sleuthkit.autopsy.keywordsearch; + +import com.google.common.base.Utf8; +import java.io.IOException; +import java.io.PushbackReader; +import java.io.Reader; +import java.util.Iterator; +import java.util.NoSuchElementException; +import javax.annotation.concurrent.NotThreadSafe; +import org.sleuthkit.autopsy.coreutils.TextUtil; + +/** + * Encapsulates the content chunking algorithm in an implementation of the + * Iterator interface. Also implements Iterable so it can be used directly in a + * for loop. The base chunk is the part of the chunk before the overlapping + * window. The window will be included at the end of the current chunk as well + * as at the beginning of the next chunk. + */ +@NotThreadSafe +class Chunker implements Iterator, Iterable { + + //Chunking algorithm paramaters-------------------------------------// + /** the maximum size of a chunk, including the window. */ + private static final int MAX_TOTAL_CHUNK_SIZE = 32766; //bytes + /** the minimum to read before we start the process of looking for + * whitespace to break at and creating an overlapping window. */ + private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; //bytes + /** The maximum size of the chunk, before the overlapping window, even if we + * couldn't find whitespace to break at. 
*/ + private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; //bytes + /** The amount of text we will read through before we give up on finding + * whitespace to break the chunk/window at. */ + private static final int WHITE_SPACE_BUFFER_SIZE = 512; //bytes + /** The number of characters to read in one go from the Reader. */ + private static final int READ_CHARS_BUFFER_SIZE = 512; //chars + + ////chunker state--------------------------------------------/// + /** The Reader that this chunk reads from, and divides into chunks. It must + * be a buffered reader to ensure that mark/reset are supported. */ + private final PushbackReader reader; + /** The local buffer of characters read from the Reader. */ + private final char[] tempChunkBuf = new char[READ_CHARS_BUFFER_SIZE]; + /** number of chars read in the most recent read operation. */ + private int charsRead = 0; + + /** The text of the current chunk (so far). */ + private StringBuilder currentChunk; + private StringBuilder currentWindow; + /** the size in bytes of the chunk (so far). */ + private int chunkSizeBytes = 0; + /** the size in chars of the (base) chunk (so far). */ + private int baseChunkSizeChars; + + /** has the chunker found whitespace to break on? */ + private boolean whitespaceFound = false; + /** has the chunker reached the end of the Reader? If so, there are no more + * chunks, and the current chunk does not need a window. */ + private boolean endOfReaderReached = false; + + /** + * Create a Chunker that will chunk the content of the given Reader. + * + * @param reader The content to chunk. + */ + Chunker(Reader reader) { + this.reader = new PushbackReader(reader, 2048); + } + + @Override + public Iterator iterator() { + return this; + } + + @Override + public boolean hasNext() { + return endOfReaderReached == false; + } + + /** + * Sanitize the given StringBuilder by replacing non-UTF-8 characters with + * caret '^' + * + * @param sb the StringBuilder to sanitize + * + * //JMTODO: use Charsequence.chars() or codePoints() and then a mapping + * function? + */ + private static StringBuilder sanitizeToUTF8(StringBuilder sb) { + final int length = sb.length(); + for (int i = 0; i < length; i++) { + if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) { + sb.replace(i, i + 1, "^"); + } + } + return sb; + } + + @Override + public Chunk next() { + if (endOfReaderReached) { + throw new NoSuchElementException("There are no more chunks."); + } + //reset state for the next chunk + currentChunk = new StringBuilder(); + currentWindow = new StringBuilder(); + chunkSizeBytes = 0; + baseChunkSizeChars = 0; + + try { + readBaseChunk(); + baseChunkSizeChars = currentChunk.length(); +// reader.mark(2048); //mark the reader so we can rewind the reader here to begin the next chunk + readWindow(); + + } catch (IOException ioEx) { + throw new RuntimeException("IOException while reading chunk.", ioEx); + } + try { + reader.unread(currentWindow.toString().toCharArray()); +// reader.reset(); //reset the reader the so the next chunk can begin at the position marked above + } catch (IOException ex) { + throw new RuntimeException("IOException while resetting chunk reader.", ex); + } + + if (endOfReaderReached) { + /* if we have reached the end of the content,we won't make another + * overlapping chunk, so the base chunk can be extended to the end. */ + baseChunkSizeChars = currentChunk.length(); + } + //sanitize the text and return a Chunk object, that includes the base chunk length. 
+ return new Chunk(sanitizeToUTF8(currentChunk), baseChunkSizeChars); + } + + /** + * Read the base chunk from the reader, and attempt to break at whitespace. + * + * @throws IOException if there is a problem reading from the reader. + */ + private void readBaseChunk() throws IOException { + //read the chunk until the minimum base chunk size + readHelper(MINIMUM_BASE_CHUNK_SIZE - 1024, false, currentChunk); + //keep reading until the maximum base chunk size or white space is reached. + whitespaceFound = false; + readHelper(MAXIMUM_BASE_CHUNK_SIZE - 1024, true, currentChunk); + + } + + /** + * Read the window from the reader, and attempt to break at whitespace. + * + * @throws IOException if there is a problem reading from the reader. + */ + private void readWindow() throws IOException { + //read the window, leaving some room to look for white space to break at. + int windowEnd = Math.min(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE - 1024, chunkSizeBytes + 1024); + readHelper(windowEnd, false, currentWindow); + whitespaceFound = false; + //keep reading until the max chunk size, or until whitespace is reached. + windowEnd = Math.min(MAX_TOTAL_CHUNK_SIZE - 1024, chunkSizeBytes + 1024); + readHelper(windowEnd, true, currentWindow); + } + + /** Helper method that implements reading in a loop. + * + * @param maxBytes The max cummulative length of the content,in + * bytes, to read from the Reader. That is, when + * chunkSizeBytes >= maxBytes stop reading. + * @param inWhiteSpaceBuffer Should the current read stop once whitespace is + * found? + * + * @throws IOException If there is a problem reading from the Reader. + */ + private void readHelper(int maxBytes, boolean inWhiteSpaceBuffer, StringBuilder currentSegment) throws IOException { + //only read one character at a time if we are looking for whitespace. + final int readSize = inWhiteSpaceBuffer ? 1 : READ_CHARS_BUFFER_SIZE; + + //read chars up to maxBytes, whitespaceFound if also inWhiteSpaceBuffer, or we reach the end of the reader. + while ((chunkSizeBytes < maxBytes) + && (false == (inWhiteSpaceBuffer && whitespaceFound)) + && (endOfReaderReached == false)) { + charsRead = reader.read(tempChunkBuf, 0, readSize); + if (-1 == charsRead) { + //this is the last chunk + endOfReaderReached = true; + } else { + + //add read chars to the chunk and update the length. + String chunkSegment = new String(tempChunkBuf, 0, charsRead); + final int segmentSize = Utf8.encodedLength(chunkSegment); + + if (chunkSizeBytes + segmentSize < maxBytes) { + chunkSizeBytes += segmentSize; + currentSegment.append(chunkSegment); + } else { + for (int i = 0; i < charsRead; i++) { + final Character character = tempChunkBuf[i]; + int charSize = Utf8.encodedLength(character.toString()); + if (chunkSizeBytes + charSize < maxBytes + && (false == (inWhiteSpaceBuffer && whitespaceFound))) { + currentSegment.append(character); + chunkSizeBytes += charSize; + if (inWhiteSpaceBuffer) { + //check for whitespace. + whitespaceFound = Character.isWhitespace(character); + } + } else { + reader.unread(tempChunkBuf, i, charsRead - i); + break; + } + } + } + } + } + } +} + +/** + * Represents one chunk as the text in it and the length of the base chunk, in + * chars. 
+ */ +class Chunk { + + private final StringBuilder sb; + private final int chunksize; + + Chunk(StringBuilder sb, int baseChunkLength) { + this.sb = sb; + this.chunksize = baseChunkLength; + } + + @Override + public String toString() { + return sb.toString(); + } + + int getBaseChunkLength() { + return chunksize; + } +} diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java index 2e1c6feb2d..9b9ffa689f 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java @@ -18,20 +18,15 @@ */ package org.sleuthkit.autopsy.keywordsearch; -import com.google.common.base.Utf8; import java.io.BufferedReader; import java.io.IOException; import java.util.HashMap; -import java.util.Iterator; import java.util.Map; -import java.util.NoSuchElementException; import java.util.logging.Level; -import javax.annotation.concurrent.NotThreadSafe; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.common.SolrInputDocument; import org.openide.util.NbBundle; import org.sleuthkit.autopsy.coreutils.Logger; -import org.sleuthkit.autopsy.coreutils.TextUtil; import org.sleuthkit.autopsy.datamodel.ContentUtils; import org.sleuthkit.autopsy.ingest.IngestJobContext; import org.sleuthkit.datamodel.AbstractFile; @@ -370,211 +365,3 @@ class Ingester { } } } - -/** - * Encapsulates the content chunking algorithm in an implementation of the - * Iterator interface. Also implements Iterable so it can be used directly in a - * for loop. The base chunk is the part of the chunk before the overlapping - * window. The window will be included at the end of the current chunk as well - * as at the beginning of the next chunk. - */ -@NotThreadSafe -class Chunker implements Iterator, Iterable { - - //Chunking algorithm paramaters-------------------------------------// - /** the maximum size of a chunk, including the window. */ - private static final int MAX_TOTAL_CHUNK_SIZE = 32766; //bytes - /** the minimum to read before we start the process of looking for - * whitespace to break at and creating an overlapping window. */ - private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; //bytes - /** The maximum size of the chunk, before the overlapping window, even if we - * couldn't find whitespace to break at. */ - private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; //bytes - /** The amount of text we will read through before we give up on finding - * whitespace to break the chunk/window at. */ - private static final int WHITE_SPACE_BUFFER_SIZE = 512; //bytes - /** The number of characters to read in one go from the Reader. */ - private static final int READ_CHARS_BUFFER_SIZE = 512; //chars - - ////chunker state--------------------------------------------/// - /** The Reader that this chunk reads from, and divides into chunks. It must - * be a buffered reader to ensure that mark/reset are supported. */ - private final BufferedReader reader; - /** The local buffer of characters read from the Reader. */ - private final char[] tempChunkBuf = new char[READ_CHARS_BUFFER_SIZE]; - /** number of chars read in the most recent read operation. */ - private int charsRead = 0; - - /** The text of the current chunk (so far). */ - private StringBuilder currentChunk; - /** the size in bytes of the chunk (so far). */ - private int chunkSizeBytes = 0; - /** the size in chars of the (base) chunk (so far). 
*/ - private int baseChunkSizeChars; - - /** has the chunker found whitespace to break on? */ - private boolean whitespaceFound = false; - /** has the chunker reached the end of the Reader? If so, there are no more - * chunks, and the current chunk does not need a window. */ - private boolean endOfReaderReached = false; - - /** - * Create a Chunker that will chunk the content of the given Reader. - * - * @param reader The content to chunk. - */ - Chunker(BufferedReader reader) { - this.reader = reader; - } - - @Override - public Iterator iterator() { - return this; - } - - @Override - public boolean hasNext() { - return endOfReaderReached == false; - } - - /** - * Sanitize the given StringBuilder by replacing non-UTF-8 characters with - * caret '^' - * - * @param sb the StringBuilder to sanitize - * - * //JMTODO: use Charsequence.chars() or codePoints() and then a mapping - * function? - */ - private static StringBuilder sanitizeToUTF8(StringBuilder sb) { - final int length = sb.length(); - for (int i = 0; i < length; i++) { - if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) { - sb.replace(i, i + 1, "^"); - } - } - return sb; - } - - @Override - public Chunk next() { - if (endOfReaderReached) { - throw new NoSuchElementException("There are no more chunks."); - } - //reset state for the next chunk - currentChunk = new StringBuilder(); - chunkSizeBytes = 0; - baseChunkSizeChars = 0; - - try { - readBaseChunk(); - baseChunkSizeChars = currentChunk.length(); - reader.mark(2048); //mark the reader so we can rewind the reader here to begin the next chunk - readWindow(); - } catch (IOException ioEx) { - throw new RuntimeException("IOException while reading chunk.", ioEx); - } - try { - reader.reset(); //reset the reader the so the next chunk can begin at the position marked above - } catch (IOException ex) { - throw new RuntimeException("IOException while resetting chunk reader.", ex); - } - - if (endOfReaderReached) { - /* if we have reached the end of the content,we won't make another - * overlapping chunk, so the base chunk can be extended to the end. */ - baseChunkSizeChars = currentChunk.length(); - } - //sanitize the text and return a Chunk object, that includes the base chunk length. - return new Chunk(sanitizeToUTF8(currentChunk), baseChunkSizeChars); - } - - /** - * Read the base chunk from the reader, and attempt to break at whitespace. - * - * @throws IOException if there is a problem reading from the reader. - */ - private void readBaseChunk() throws IOException { - //read the chunk until the minimum base chunk size - readHelper(MINIMUM_BASE_CHUNK_SIZE, false); - //keep reading until the maximum base chunk size or white space is reached. - whitespaceFound = false; - readHelper(MAXIMUM_BASE_CHUNK_SIZE, true); - - } - - /** - * Read the window from the reader, and attempt to break at whitespace. - * - * @throws IOException if there is a problem reading from the reader. - */ - private void readWindow() throws IOException { - //read the window, leaving some room to look for white space to break at. - int windowEnd = Math.min(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, chunkSizeBytes + 1024); - readHelper(windowEnd, false); - whitespaceFound = false; - //keep reading until the max chunk size, or until whitespace is reached. - windowEnd = Math.min(MAX_TOTAL_CHUNK_SIZE, chunkSizeBytes + 1024); - readHelper(windowEnd, true); - } - - /** Helper method that implements reading in a loop. 
- * - * @param maxBytes The max cummulative length of the content,in - * bytes, to read from the Reader. That is, when - * chunkSizeBytes >= maxBytes stop reading. - * @param inWhiteSpaceBuffer Should the current read stop once whitespace is - * found? - * - * @throws IOException If there is a problem reading from the Reader. - */ - private void readHelper(int maxBytes, boolean inWhiteSpaceBuffer) throws IOException { - //only read one character at a time if we are looking for whitespace. - final int readSize = inWhiteSpaceBuffer ? 1 : READ_CHARS_BUFFER_SIZE; - - //read chars up to maxBytes, whitespaceFound if also inWhiteSpaceBuffer, or we reach the end of the reader. - while ((chunkSizeBytes < maxBytes) - && (false == (inWhiteSpaceBuffer && whitespaceFound)) - && (endOfReaderReached == false)) { - charsRead = reader.read(tempChunkBuf, 0, readSize); - if (-1 == charsRead) { - //this is the last chunk - endOfReaderReached = true; - } else { - if (inWhiteSpaceBuffer) { - //chec for whitespace. - whitespaceFound = Character.isWhitespace(tempChunkBuf[0]); - } - - //add read chars to the chunk and update the length. - String chunkSegment = new String(tempChunkBuf, 0, charsRead); - chunkSizeBytes += Utf8.encodedLength(chunkSegment); - currentChunk.append(chunkSegment); - } - } - } -} - -/** - * Represents one chunk as the text in it and the length of the base chunk, in - * chars. - */ -class Chunk { - - private final StringBuilder sb; - private final int chunksize; - - Chunk(StringBuilder sb, int baseChunkLength) { - this.sb = sb; - this.chunksize = baseChunkLength; - } - - @Override - public String toString() { - return sb.toString(); - } - - int getBaseChunkLength() { - return chunksize; - } -} From 0a7af485f26ab0de035cf3f9d6e2ff273b352a31 Mon Sep 17 00:00:00 2001 From: millmanorama Date: Fri, 13 Jan 2017 11:27:39 +0100 Subject: [PATCH 2/8] return instead of just break to stop infinite loop --- .../src/org/sleuthkit/autopsy/keywordsearch/Chunker.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java index ffe0ed4b0b..4b108558d0 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java @@ -210,7 +210,7 @@ class Chunker implements Iterator, Iterable { } } else { reader.unread(tempChunkBuf, i, charsRead - i); - break; + return; } } } From 104cbd4dc48606b18a43df992f969bb96e0f7fa4 Mon Sep 17 00:00:00 2001 From: jmillman Date: Fri, 13 Jan 2017 06:55:42 -0500 Subject: [PATCH 3/8] restore space in "Keyword Search" --- .../src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties index 43ba62c3c6..8db9e3167d 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Bundle.properties @@ -199,7 +199,7 @@ KeywordSearchOptionsPanelController.moduleErr.msg1=A module caused an error list KeywordSearchOptionsPanelController.moduleErr.msg2=A module caused an error listening to KeywordSearchOptionsPanelController updates. See log to determine which module. Some data could be incomplete. 
KeywordSearchQueryManager.pathText.text=Keyword search KeywordSearchResultFactory.progress.saving=Saving results\: {0} -KeywordSearchSettings.moduleName.text=KeywordSearch +KeywordSearchSettings.moduleName.text=Keyword Search KeywordSearchSettings.properties_options.text={0}_Options KeywordSearchSettings.propertiesNSRL.text={0}_NSRL KeywordSearchSettings.propertiesScripts.text={0}_Scripts From 506a3037a42d78079bfbca73170e4cbc34674bd6 Mon Sep 17 00:00:00 2001 From: millmanorama Date: Mon, 16 Jan 2017 15:51:04 +0100 Subject: [PATCH 4/8] finish PushbackReader implementation of Chunker, introduce TextExtractorException --- .../keywordsearch/ArtifactTextExtractor.java | 35 ++-- .../autopsy/keywordsearch/Chunker.java | 163 ++++++++++-------- .../keywordsearch/FileTextExtractor.java | 2 +- .../keywordsearch/HtmlTextExtractor.java | 4 +- .../autopsy/keywordsearch/Ingester.java | 8 +- .../keywordsearch/StringsTextExtractor.java | 2 +- .../autopsy/keywordsearch/TextExtractor.java | 15 +- .../keywordsearch/TikaTextExtractor.java | 7 +- 8 files changed, 135 insertions(+), 101 deletions(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java index 07657f9646..962e5ba245 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/ArtifactTextExtractor.java @@ -39,6 +39,7 @@ import org.sleuthkit.datamodel.TskCoreException; * artifact's attributes. */ class ArtifactTextExtractor implements TextExtractor { + static final private Logger logger = Logger.getLogger(ArtifactTextExtractor.class.getName()); /** @@ -82,26 +83,31 @@ class ArtifactTextExtractor implements TextExtractor { } @Override - public boolean isDisabled() { + public boolean isDisabled() { return false; - } + } - @Override - public void logWarning(final String msg, Exception ex) { + @Override + public void logWarning(final String msg, Exception ex) { logger.log(Level.WARNING, msg, ex); //NON-NLS } } - private InputStream getInputStream(BlackboardArtifact artifact) { + private InputStream getInputStream(BlackboardArtifact artifact) throws TextExtractorException { // Concatenate the string values of all attributes into a single // "content" string to be indexed. 
StringBuilder artifactContents = new StringBuilder(); + Content dataSource = null; try { - Content dataSource = getDataSource(artifact); - if (dataSource == null) { - return null; - } + dataSource = getDataSource(artifact); + } catch (TskCoreException tskCoreException) { + throw new TextExtractorException("Unable to get datasource for artifact: " + artifact.toString(), tskCoreException); + } + if (dataSource == null) { + throw new TextExtractorException("Datasource was null for artifact: " + artifact.toString()); + } + try { for (BlackboardAttribute attribute : artifact.getAttributes()) { artifactContents.append(attribute.getAttributeType().getDisplayName()); artifactContents.append(" : "); @@ -119,18 +125,15 @@ class ArtifactTextExtractor implements TextExtractor { } artifactContents.append(System.lineSeparator()); } - } catch (TskCoreException ex) { - logger.log(Level.SEVERE, "There was a problem getting the atributes for artifact " + artifact.getArtifactID(), ex); - return null; - } - if (artifactContents.length() == 0) { - return null; + } catch (TskCoreException tskCoreException) { + throw new TextExtractorException("Unable to get attributes for artifact: " + artifact.toString(), tskCoreException); } + return IOUtils.toInputStream(artifactContents, StandardCharsets.UTF_8); } @Override - public Reader getReader(BlackboardArtifact source) throws Ingester.IngesterException { + public Reader getReader(BlackboardArtifact source) throws TextExtractorException { return new InputStreamReader(getInputStream(source), StandardCharsets.UTF_8); } diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java index 4b108558d0..799c238b26 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java @@ -5,14 +5,15 @@ */ package org.sleuthkit.autopsy.keywordsearch; -import com.google.common.base.Utf8; import java.io.IOException; import java.io.PushbackReader; import java.io.Reader; +import java.nio.charset.StandardCharsets; import java.util.Iterator; import java.util.NoSuchElementException; import javax.annotation.concurrent.NotThreadSafe; import org.sleuthkit.autopsy.coreutils.TextUtil; +import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk; /** * Encapsulates the content chunking algorithm in an implementation of the @@ -56,19 +57,18 @@ class Chunker implements Iterator, Iterable { /** the size in chars of the (base) chunk (so far). */ private int baseChunkSizeChars; - /** has the chunker found whitespace to break on? */ - private boolean whitespaceFound = false; /** has the chunker reached the end of the Reader? If so, there are no more * chunks, and the current chunk does not need a window. */ private boolean endOfReaderReached = false; + /** * Create a Chunker that will chunk the content of the given Reader. * * @param reader The content to chunk. 
*/ Chunker(Reader reader) { - this.reader = new PushbackReader(reader, 2048); + this.reader = new PushbackReader(reader, MAX_TOTAL_CHUNK_SIZE); } @Override @@ -114,7 +114,6 @@ class Chunker implements Iterator, Iterable { try { readBaseChunk(); baseChunkSizeChars = currentChunk.length(); -// reader.mark(2048); //mark the reader so we can rewind the reader here to begin the next chunk readWindow(); } catch (IOException ioEx) { @@ -122,7 +121,6 @@ class Chunker implements Iterator, Iterable { } try { reader.unread(currentWindow.toString().toCharArray()); -// reader.reset(); //reset the reader the so the next chunk can begin at the position marked above } catch (IOException ex) { throw new RuntimeException("IOException while resetting chunk reader.", ex); } @@ -143,10 +141,10 @@ class Chunker implements Iterator, Iterable { */ private void readBaseChunk() throws IOException { //read the chunk until the minimum base chunk size - readHelper(MINIMUM_BASE_CHUNK_SIZE - 1024, false, currentChunk); + readHelper(MINIMUM_BASE_CHUNK_SIZE - 1024, currentChunk); //keep reading until the maximum base chunk size or white space is reached. - whitespaceFound = false; - readHelper(MAXIMUM_BASE_CHUNK_SIZE - 1024, true, currentChunk); + + readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE - 1024, currentChunk); } @@ -158,87 +156,110 @@ class Chunker implements Iterator, Iterable { private void readWindow() throws IOException { //read the window, leaving some room to look for white space to break at. int windowEnd = Math.min(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE - 1024, chunkSizeBytes + 1024); - readHelper(windowEnd, false, currentWindow); - whitespaceFound = false; + readHelper(windowEnd, currentWindow); //keep reading until the max chunk size, or until whitespace is reached. windowEnd = Math.min(MAX_TOTAL_CHUNK_SIZE - 1024, chunkSizeBytes + 1024); - readHelper(windowEnd, true, currentWindow); + readToWhiteSpaceHelper(windowEnd, currentWindow); } - /** Helper method that implements reading in a loop. - * - * @param maxBytes The max cummulative length of the content,in - * bytes, to read from the Reader. That is, when - * chunkSizeBytes >= maxBytes stop reading. - * @param inWhiteSpaceBuffer Should the current read stop once whitespace is - * found? - * - * @throws IOException If there is a problem reading from the Reader. - */ - private void readHelper(int maxBytes, boolean inWhiteSpaceBuffer, StringBuilder currentSegment) throws IOException { - //only read one character at a time if we are looking for whitespace. - final int readSize = inWhiteSpaceBuffer ? 1 : READ_CHARS_BUFFER_SIZE; + private void readHelper(int maxBytes, StringBuilder currentSegment) throws IOException { - //read chars up to maxBytes, whitespaceFound if also inWhiteSpaceBuffer, or we reach the end of the reader. + //read chars up to maxBytes, or we reach the end of the reader. while ((chunkSizeBytes < maxBytes) - && (false == (inWhiteSpaceBuffer && whitespaceFound)) && (endOfReaderReached == false)) { - charsRead = reader.read(tempChunkBuf, 0, readSize); + charsRead = reader.read(tempChunkBuf, 0, READ_CHARS_BUFFER_SIZE); if (-1 == charsRead) { //this is the last chunk endOfReaderReached = true; + return; } else { + //if the last charcter might be part of a surroate pair, unread it. 
+ final char lastChar = tempChunkBuf[charsRead - 1]; + String chunkSegment; + if (Character.isHighSurrogate(lastChar)) { + charsRead--; + chunkSegment = new String(tempChunkBuf, 0, charsRead); + reader.unread(lastChar); + } else { + chunkSegment = new String(tempChunkBuf, 0, charsRead); + } //add read chars to the chunk and update the length. - String chunkSegment = new String(tempChunkBuf, 0, charsRead); - final int segmentSize = Utf8.encodedLength(chunkSegment); + int segmentSize = chunkSegment.getBytes(StandardCharsets.UTF_8).length; if (chunkSizeBytes + segmentSize < maxBytes) { - chunkSizeBytes += segmentSize; currentSegment.append(chunkSegment); + chunkSizeBytes = currentSegment.toString().getBytes(StandardCharsets.UTF_8).length; } else { - for (int i = 0; i < charsRead; i++) { - final Character character = tempChunkBuf[i]; - int charSize = Utf8.encodedLength(character.toString()); - if (chunkSizeBytes + charSize < maxBytes - && (false == (inWhiteSpaceBuffer && whitespaceFound))) { - currentSegment.append(character); - chunkSizeBytes += charSize; - if (inWhiteSpaceBuffer) { - //check for whitespace. - whitespaceFound = Character.isWhitespace(character); - } - } else { - reader.unread(tempChunkBuf, i, charsRead - i); - return; - } - } + reader.unread(tempChunkBuf, 0, charsRead); + return; } } } } -} - -/** - * Represents one chunk as the text in it and the length of the base chunk, in - * chars. - */ -class Chunk { - - private final StringBuilder sb; - private final int chunksize; - - Chunk(StringBuilder sb, int baseChunkLength) { - this.sb = sb; - this.chunksize = baseChunkLength; - } - - @Override - public String toString() { - return sb.toString(); - } - - int getBaseChunkLength() { - return chunksize; - } + + private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentSegment) throws IOException { + boolean whitespaceFound = false; + //read 1 char at a time up to maxBytes, whitespaceFound, or we reach the end of the reader. + while ((chunkSizeBytes < maxBytes) + && (whitespaceFound == false) + && (endOfReaderReached == false)) { + charsRead = reader.read(tempChunkBuf, 0, 1); + if (-1 == charsRead) { + //this is the last chunk + endOfReaderReached = true; + return; + } else { + //if the last charcter might be part of a surroate pair, read another char + final char ch = tempChunkBuf[0]; + String chunkSegment; + if (Character.isHighSurrogate(ch)) { + charsRead = reader.read(tempChunkBuf, 0, 1); + if (charsRead == -1) { + currentSegment.append(ch); + chunkSizeBytes = currentSegment.toString().getBytes(StandardCharsets.UTF_8).length; + //this is the last chunk + endOfReaderReached = true; + return; + } else { + chunkSegment = new String(new char[]{ch, tempChunkBuf[0]}); + } + } else { + chunkSegment = new String(tempChunkBuf, 0, 1); + } + //check for whitespace. + whitespaceFound = Character.isWhitespace(chunkSegment.codePointAt(0)); + //add read chars to the chunk and update the length. + currentSegment.append(chunkSegment); + /* this is wrong once we are in the white space but should have + * no negative effect */ + chunkSizeBytes = currentSegment.toString().getBytes(StandardCharsets.UTF_8).length; + } + } + } + + /** + * Represents one chunk as the text in it and the length of the base chunk, + * in chars. 
+ */ + static class Chunk { + + private final StringBuilder sb; + private final int chunksize; + + Chunk(StringBuilder sb, int baseChunkLength) { + this.sb = sb; + this.chunksize = baseChunkLength; + } + + @Override + public String toString() { + return sb.toString(); + } + + int getBaseChunkLength() { + return chunksize; + } + } + } diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileTextExtractor.java index 55838f4e7f..689f42591b 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileTextExtractor.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/FileTextExtractor.java @@ -95,7 +95,7 @@ abstract class FileTextExtractor implements TextExtractor< AbstractFile> { abstract boolean isSupported(AbstractFile file, String detectedFormat); @Override - public abstract Reader getReader(AbstractFile source) throws Ingester.IngesterException; + public abstract Reader getReader(AbstractFile source) throws TextExtractorException; @Override public long getID(AbstractFile source) { diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java index e758ef86a0..f72b02d1eb 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java @@ -65,7 +65,7 @@ class HtmlTextExtractor extends FileTextExtractor { } @Override - public Reader getReader(AbstractFile sourceFile) throws Ingester.IngesterException { + public Reader getReader(AbstractFile sourceFile) throws TextExtractorException { ReadContentInputStream stream = new ReadContentInputStream(sourceFile); //Parse the stream with Jericho and put the results in a Reader @@ -159,7 +159,7 @@ class HtmlTextExtractor extends FileTextExtractor { // All done, now make it a reader return new StringReader(stringBuilder.toString()); } catch (IOException ex) { - throw new Ingester.IngesterException("Error extracting HTML from content.", ex); + throw new TextExtractorException("Error extracting HTML from content.", ex); } } diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java index 9b9ffa689f..a4b36ef651 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java @@ -19,7 +19,6 @@ package org.sleuthkit.autopsy.keywordsearch; import java.io.BufferedReader; -import java.io.IOException; import java.util.HashMap; import java.util.Map; import java.util.logging.Level; @@ -29,6 +28,8 @@ import org.openide.util.NbBundle; import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.datamodel.ContentUtils; import org.sleuthkit.autopsy.ingest.IngestJobContext; +import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk; +import org.sleuthkit.autopsy.keywordsearch.TextExtractor.TextExtractorException; import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.BlackboardArtifact; import org.sleuthkit.datamodel.DerivedFile; @@ -170,10 +171,11 @@ class Ingester { throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", chunkId), ex); } } - } catch (IOException ex) { + } catch (TextExtractorException ex) { extractor.logWarning("Unable to read content stream from " + sourceID + ": " + sourceName, 
ex);//NON-NLS return false; - } catch (Exception ex) { + } //NON-NLS + catch (Exception ex) { extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS return false; } finally { diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java index 23c49c255f..919510332b 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/StringsTextExtractor.java @@ -107,7 +107,7 @@ class StringsTextExtractor extends FileTextExtractor { } @Override - public InputStreamReader getReader(AbstractFile sourceFile) throws Ingester.IngesterException { + public InputStreamReader getReader(AbstractFile sourceFile) throws TextExtractorException { InputStream stringStream = getInputStream(sourceFile); return new InputStreamReader(stringStream, Server.DEFAULT_INDEXED_TEXT_CHARSET); } diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextExtractor.java index 6ea27e733b..94abb940eb 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextExtractor.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextExtractor.java @@ -30,7 +30,6 @@ import org.sleuthkit.datamodel.SleuthkitVisitableItem; */ interface TextExtractor< TextSource extends SleuthkitVisitableItem> { - /** * Is this extractor configured such that no extraction will/should be done? * @@ -46,7 +45,6 @@ interface TextExtractor< TextSource extends SleuthkitVisitableItem> { */ abstract void logWarning(String msg, Exception ex); - /** * Get a reader that over the text extracted from the given source. * @@ -57,7 +55,7 @@ interface TextExtractor< TextSource extends SleuthkitVisitableItem> { * * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException */ - abstract Reader getReader(TextSource source) throws Ingester.IngesterException; + abstract Reader getReader(TextSource source) throws TextExtractorException; /** * Get the 'object' id of the given source. 
@@ -76,4 +74,15 @@ interface TextExtractor< TextSource extends SleuthkitVisitableItem> { * @return */ abstract String getName(TextSource source); + + class TextExtractorException extends Exception { + + public TextExtractorException(String message) { + super(message); + } + + public TextExtractorException(String message, Throwable cause) { + super(message, cause); + } + } } diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java index 7ac5392016..3a494be9c6 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaTextExtractor.java @@ -36,7 +36,6 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.openide.util.NbBundle; import org.sleuthkit.autopsy.coreutils.Logger; -import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException; import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.ReadContentInputStream; @@ -67,7 +66,7 @@ class TikaTextExtractor extends FileTextExtractor { } @Override - public Reader getReader(AbstractFile sourceFile) throws IngesterException, MissingResourceException { + public Reader getReader(AbstractFile sourceFile) throws TextExtractorException, MissingResourceException { ReadContentInputStream stream = new ReadContentInputStream(sourceFile); Metadata metadata = new Metadata(); @@ -81,12 +80,12 @@ class TikaTextExtractor extends FileTextExtractor { } catch (TimeoutException te) { final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.tikaParseTimeout.text", sourceFile.getId(), sourceFile.getName()); logWarning(msg, te); - throw new IngesterException(msg); + throw new TextExtractorException(msg, te); } catch (Exception ex) { KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content" + sourceFile.getId() + ": " + sourceFile.getName(), ex.getCause()); //NON-NLS final String msg = NbBundle.getMessage(this.getClass(), "AbstractFileTikaTextExtract.index.exception.tikaParse.msg", sourceFile.getId(), sourceFile.getName()); logWarning(msg, ex); - throw new IngesterException(msg, ex); + throw new TextExtractorException(msg, ex); } } From 23afb4515e7b75135483691cc859a8586ce81c9b Mon Sep 17 00:00:00 2001 From: millmanorama Date: Wed, 18 Jan 2017 13:04:50 +0100 Subject: [PATCH 5/8] refactor exception handling of Chunker --- .../autopsy/keywordsearch/Chunker.java | 24 +++++++++++-------- .../autopsy/keywordsearch/Ingester.java | 21 +++++++--------- 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java index 799c238b26..2583a4e4c0 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java @@ -60,7 +60,7 @@ class Chunker implements Iterator, Iterable { /** has the chunker reached the end of the Reader? If so, there are no more * chunks, and the current chunk does not need a window. */ private boolean endOfReaderReached = false; - + private Exception ex; /** * Create a Chunker that will chunk the content of the given Reader. 
@@ -76,9 +76,18 @@ class Chunker implements Iterator, Iterable { return this; } + boolean hasException() { + return ex != null; + } + + public Exception getException() { + return ex; + } + @Override public boolean hasNext() { - return endOfReaderReached == false; + return (ex == null) + && (endOfReaderReached == false); } /** @@ -102,7 +111,7 @@ class Chunker implements Iterator, Iterable { @Override public Chunk next() { - if (endOfReaderReached) { + if (hasNext() == false) { throw new NoSuchElementException("There are no more chunks."); } //reset state for the next chunk @@ -115,14 +124,9 @@ class Chunker implements Iterator, Iterable { readBaseChunk(); baseChunkSizeChars = currentChunk.length(); readWindow(); - - } catch (IOException ioEx) { - throw new RuntimeException("IOException while reading chunk.", ioEx); - } - try { reader.unread(currentWindow.toString().toCharArray()); - } catch (IOException ex) { - throw new RuntimeException("IOException while resetting chunk reader.", ex); + } catch (IOException ioEx) { + ex = ioEx; } if (endOfReaderReached) { diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java index a4b36ef651..0dc221356c 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Ingester.java @@ -29,7 +29,6 @@ import org.sleuthkit.autopsy.coreutils.Logger; import org.sleuthkit.autopsy.datamodel.ContentUtils; import org.sleuthkit.autopsy.ingest.IngestJobContext; import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk; -import org.sleuthkit.autopsy.keywordsearch.TextExtractor.TextExtractorException; import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.BlackboardArtifact; import org.sleuthkit.datamodel.DerivedFile; @@ -145,8 +144,8 @@ class Ingester { int numChunks = 0; //unknown until chunking is done if (extractor.isDisabled()) { - /* some Extrctors, notable the strings extractor, have options which - * can be configured such that no extraction should be done */ + /* some Extractors, notable the strings extractor, have options + * which can be configured such that no extraction should be done */ return true; } @@ -167,15 +166,13 @@ class Ingester { + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS throw ingEx; //need to rethrow to signal error and move on - } catch (Exception ex) { - throw new IngesterException(String.format("Error ingesting (indexing) file chunk: %s", chunkId), ex); } } - } catch (TextExtractorException ex) { - extractor.logWarning("Unable to read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS - return false; - } //NON-NLS - catch (Exception ex) { + if (chunker.hasException()) { + extractor.logWarning("Error chunking content from " + sourceID + ": " + sourceName, chunker.getException()); + return false; + } + } catch (Exception ex) { extractor.logWarning("Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS return false; } finally { @@ -189,7 +186,7 @@ class Ingester { } /** - * Add one chunk as to the Solr index as a seperate sold document. + * Add one chunk as to the Solr index as a separate Solr document. * * TODO see if can use a byte or string streaming way to add content to * /update handler e.g. 
with XMLUpdateRequestHandler (deprecated in SOlr @@ -229,7 +226,7 @@ class Ingester { uncommitedIngests = true; } catch (KeywordSearchModuleException ex) { - //JMTODO: does this need to ne internationalized? + //JMTODO: does this need to be internationalized? throw new IngesterException( NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex); } From 851c0721eddbababf6f3b3a62e556d9a89f1ea5d Mon Sep 17 00:00:00 2001 From: millmanorama Date: Thu, 19 Jan 2017 12:30:03 +0100 Subject: [PATCH 6/8] correct size calculations --- .../autopsy/keywordsearch/Chunker.java | 31 +++++++++++++------ 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java index 2583a4e4c0..ef1d06d98f 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java @@ -133,6 +133,11 @@ class Chunker implements Iterator, Iterable { /* if we have reached the end of the content,we won't make another * overlapping chunk, so the base chunk can be extended to the end. */ baseChunkSizeChars = currentChunk.length(); + System.out.println("chunksize: " + baseChunkSizeChars); + System.out.println("no window"); + } else { + System.out.println("chunksize: " + baseChunkSizeChars); + System.out.println("window size: " + (currentChunk.length() - baseChunkSizeChars)); } //sanitize the text and return a Chunk object, that includes the base chunk length. return new Chunk(sanitizeToUTF8(currentChunk), baseChunkSizeChars); @@ -145,10 +150,12 @@ class Chunker implements Iterator, Iterable { */ private void readBaseChunk() throws IOException { //read the chunk until the minimum base chunk size - readHelper(MINIMUM_BASE_CHUNK_SIZE - 1024, currentChunk); - //keep reading until the maximum base chunk size or white space is reached. + readHelper(MINIMUM_BASE_CHUNK_SIZE, currentChunk); + System.out.println("base chunk 1: " + chunkSizeBytes); - readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE - 1024, currentChunk); + //keep reading until the maximum base chunk size or white space is reached. + readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, currentChunk); + System.out.println("base chunk 2: " + chunkSizeBytes); } @@ -159,11 +166,15 @@ class Chunker implements Iterator, Iterable { */ private void readWindow() throws IOException { //read the window, leaving some room to look for white space to break at. - int windowEnd = Math.min(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE - 1024, chunkSizeBytes + 1024); - readHelper(windowEnd, currentWindow); +// int windowEnd = Math.min(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, chunkSizeBytes + 1024); + readHelper(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, currentWindow); + System.out.println("window chunk 1: " + chunkSizeBytes); + //keep reading until the max chunk size, or until whitespace is reached. 
- windowEnd = Math.min(MAX_TOTAL_CHUNK_SIZE - 1024, chunkSizeBytes + 1024); - readToWhiteSpaceHelper(windowEnd, currentWindow); +// windowEnd = Math.min(MAX_TOTAL_CHUNK_SIZE, chunkSizeBytes + 1024); + readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, currentWindow); + System.out.println("window chunk 2: " + chunkSizeBytes); + } private void readHelper(int maxBytes, StringBuilder currentSegment) throws IOException { @@ -193,7 +204,7 @@ class Chunker implements Iterator, Iterable { if (chunkSizeBytes + segmentSize < maxBytes) { currentSegment.append(chunkSegment); - chunkSizeBytes = currentSegment.toString().getBytes(StandardCharsets.UTF_8).length; + chunkSizeBytes += segmentSize; } else { reader.unread(tempChunkBuf, 0, charsRead); return; @@ -221,7 +232,7 @@ class Chunker implements Iterator, Iterable { charsRead = reader.read(tempChunkBuf, 0, 1); if (charsRead == -1) { currentSegment.append(ch); - chunkSizeBytes = currentSegment.toString().getBytes(StandardCharsets.UTF_8).length; + chunkSizeBytes += new Character(ch).toString().getBytes(StandardCharsets.UTF_8).length; //this is the last chunk endOfReaderReached = true; return; @@ -237,7 +248,7 @@ class Chunker implements Iterator, Iterable { currentSegment.append(chunkSegment); /* this is wrong once we are in the white space but should have * no negative effect */ - chunkSizeBytes = currentSegment.toString().getBytes(StandardCharsets.UTF_8).length; + chunkSizeBytes += chunkSegment.getBytes(StandardCharsets.UTF_8).length; } } } From d490dfc44bd62bd811c81531a4f99928b65c72ea Mon Sep 17 00:00:00 2001 From: millmanorama Date: Thu, 19 Jan 2017 16:11:37 +0100 Subject: [PATCH 7/8] fix window overlapping --- .../autopsy/keywordsearch/Chunker.java | 37 +++++++------------ 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java index ef1d06d98f..9f3a99d5e5 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java @@ -124,21 +124,19 @@ class Chunker implements Iterator, Iterable { readBaseChunk(); baseChunkSizeChars = currentChunk.length(); readWindow(); - reader.unread(currentWindow.toString().toCharArray()); + if (endOfReaderReached) { + /* if we have reached the end of the content,we won't make + * another overlapping chunk, so the length of the base chunk + * can be extended * to the end. */ + baseChunkSizeChars = currentChunk.length(); + } else { + reader.unread(currentWindow.toString().toCharArray()); + } } catch (IOException ioEx) { ex = ioEx; } - - if (endOfReaderReached) { - /* if we have reached the end of the content,we won't make another - * overlapping chunk, so the base chunk can be extended to the end. */ - baseChunkSizeChars = currentChunk.length(); - System.out.println("chunksize: " + baseChunkSizeChars); - System.out.println("no window"); - } else { - System.out.println("chunksize: " + baseChunkSizeChars); - System.out.println("window size: " + (currentChunk.length() - baseChunkSizeChars)); - } + //add the window text to the current chunk. + currentChunk.append(currentWindow); //sanitize the text and return a Chunk object, that includes the base chunk length. 
return new Chunk(sanitizeToUTF8(currentChunk), baseChunkSizeChars); } @@ -151,12 +149,9 @@ class Chunker implements Iterator, Iterable { private void readBaseChunk() throws IOException { //read the chunk until the minimum base chunk size readHelper(MINIMUM_BASE_CHUNK_SIZE, currentChunk); - System.out.println("base chunk 1: " + chunkSizeBytes); //keep reading until the maximum base chunk size or white space is reached. readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, currentChunk); - System.out.println("base chunk 2: " + chunkSizeBytes); - } /** @@ -166,15 +161,10 @@ class Chunker implements Iterator, Iterable { */ private void readWindow() throws IOException { //read the window, leaving some room to look for white space to break at. -// int windowEnd = Math.min(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, chunkSizeBytes + 1024); readHelper(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, currentWindow); - System.out.println("window chunk 1: " + chunkSizeBytes); //keep reading until the max chunk size, or until whitespace is reached. -// windowEnd = Math.min(MAX_TOTAL_CHUNK_SIZE, chunkSizeBytes + 1024); readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, currentWindow); - System.out.println("window chunk 2: " + chunkSizeBytes); - } private void readHelper(int maxBytes, StringBuilder currentSegment) throws IOException { @@ -229,7 +219,7 @@ class Chunker implements Iterator, Iterable { final char ch = tempChunkBuf[0]; String chunkSegment; if (Character.isHighSurrogate(ch)) { - charsRead = reader.read(tempChunkBuf, 0, 1); + charsRead = reader.read(tempChunkBuf, 1, 1); if (charsRead == -1) { currentSegment.append(ch); chunkSizeBytes += new Character(ch).toString().getBytes(StandardCharsets.UTF_8).length; @@ -237,7 +227,7 @@ class Chunker implements Iterator, Iterable { endOfReaderReached = true; return; } else { - chunkSegment = new String(new char[]{ch, tempChunkBuf[0]}); + chunkSegment = new String(tempChunkBuf, 0, 2); } } else { chunkSegment = new String(tempChunkBuf, 0, 1); @@ -246,8 +236,7 @@ class Chunker implements Iterator, Iterable { whitespaceFound = Character.isWhitespace(chunkSegment.codePointAt(0)); //add read chars to the chunk and update the length. currentSegment.append(chunkSegment); - /* this is wrong once we are in the white space but should have - * no negative effect */ + chunkSizeBytes += chunkSegment.getBytes(StandardCharsets.UTF_8).length; } } From 363b5c2cce4f96a4943de26169ee3a704e1a747d Mon Sep 17 00:00:00 2001 From: millmanorama Date: Thu, 19 Jan 2017 16:56:43 +0100 Subject: [PATCH 8/8] comments and some minor refactoring --- .../sleuthkit/autopsy/casemodule/Case.java | 8 +- .../CoordinationService.java | 253 +++++++++--------- .../CoordinationServiceNamespace.java | 4 +- .../autoingest/AutoIngestJobLogger.java | 2 +- .../autoingest/AutoIngestManager.java | 2 +- .../configuration/SharedConfiguration.java | 4 +- .../autopsy/keywordsearch/Chunker.java | 111 +++++--- 7 files changed, 209 insertions(+), 175 deletions(-) diff --git a/Core/src/org/sleuthkit/autopsy/casemodule/Case.java b/Core/src/org/sleuthkit/autopsy/casemodule/Case.java index 3beac08648..d681e81b48 100644 --- a/Core/src/org/sleuthkit/autopsy/casemodule/Case.java +++ b/Core/src/org/sleuthkit/autopsy/casemodule/Case.java @@ -1008,7 +1008,7 @@ public class Case implements SleuthkitCase.ErrorObserver { // The shared lock needs to be created on a special thread so it can be released // from the same thread. 
Future future = getCurrentCaseExecutor().submit(() -> { - currentCaseLock = CoordinationService.getServiceForNamespace(CoordinationServiceNamespace.getRoot()).tryGetSharedLock(CoordinationService.CategoryNode.CASES, caseDir); + currentCaseLock = CoordinationService.getInstance(CoordinationServiceNamespace.getRoot()).tryGetSharedLock(CoordinationService.CategoryNode.CASES, caseDir); if (null == currentCaseLock) { throw new CaseActionException(NbBundle.getMessage(Case.class, "Case.exception.errorLocking", CaseMetadata.getFileExtension())); } @@ -1019,7 +1019,7 @@ public class Case implements SleuthkitCase.ErrorObserver { // The exclusive lock uses the unique case name. // This lock does not need to be on a special thread since it will be released before // leaving this method - exclusiveResourceLock = CoordinationService.getServiceForNamespace(CoordinationServiceNamespace.getRoot()).tryGetExclusiveLock(CoordinationService.CategoryNode.RESOURCE, + exclusiveResourceLock = CoordinationService.getInstance(CoordinationServiceNamespace.getRoot()).tryGetExclusiveLock(CoordinationService.CategoryNode.RESOURCE, dbName, 12, TimeUnit.HOURS); if (null == exclusiveResourceLock) { throw new CaseActionException(NbBundle.getMessage(Case.class, "Case.exception.errorLocking", CaseMetadata.getFileExtension())); @@ -1269,7 +1269,7 @@ public class Case implements SleuthkitCase.ErrorObserver { // The shared lock needs to be created on a special thread so it can be released // from the same thread. Future future = getCurrentCaseExecutor().submit(() -> { - currentCaseLock = CoordinationService.getServiceForNamespace(CoordinationServiceNamespace.getRoot()).tryGetSharedLock(CoordinationService.CategoryNode.CASES, metadata.getCaseDirectory()); + currentCaseLock = CoordinationService.getInstance(CoordinationServiceNamespace.getRoot()).tryGetSharedLock(CoordinationService.CategoryNode.CASES, metadata.getCaseDirectory()); if (null == currentCaseLock) { throw new CaseActionException(NbBundle.getMessage(Case.class, "Case.exception.errorLocking", CaseMetadata.getFileExtension())); } @@ -1280,7 +1280,7 @@ public class Case implements SleuthkitCase.ErrorObserver { // The exclusive lock uses the unique case name // This lock does not need to be on a special thread since it will be released before // leaving this method - exclusiveResourceLock = CoordinationService.getServiceForNamespace(CoordinationServiceNamespace.getRoot()).tryGetExclusiveLock(CoordinationService.CategoryNode.RESOURCE, + exclusiveResourceLock = CoordinationService.getInstance(CoordinationServiceNamespace.getRoot()).tryGetExclusiveLock(CoordinationService.CategoryNode.RESOURCE, metadata.getCaseDatabaseName(), 12, TimeUnit.HOURS); if (null == exclusiveResourceLock) { throw new CaseActionException(NbBundle.getMessage(Case.class, "Case.exception.errorLocking", CaseMetadata.getFileExtension())); diff --git a/Core/src/org/sleuthkit/autopsy/coordinationservice/CoordinationService.java b/Core/src/org/sleuthkit/autopsy/coordinationservice/CoordinationService.java index adc89d4afa..39c5c25ba6 100644 --- a/Core/src/org/sleuthkit/autopsy/coordinationservice/CoordinationService.java +++ b/Core/src/org/sleuthkit/autopsy/coordinationservice/CoordinationService.java @@ -1,7 +1,7 @@ /* * Autopsy Forensic Browser * - * Copyright 2011-2017 Basis Technology Corp. + * Copyright 2015 Basis Technology Corp. 
* Contact: carrier sleuthkit org * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -18,89 +18,129 @@ */ package org.sleuthkit.autopsy.coordinationservice; -import java.io.IOException; import java.util.HashMap; import java.util.Map; import java.util.concurrent.TimeUnit; +import org.apache.zookeeper.CreateMode; +import org.apache.zookeeper.ZooDefs; import org.apache.curator.RetryPolicy; +import org.apache.curator.retry.ExponentialBackoffRetry; import org.apache.curator.framework.CuratorFramework; import org.apache.curator.framework.CuratorFrameworkFactory; import org.apache.curator.framework.recipes.locks.InterProcessMutex; import org.apache.curator.framework.recipes.locks.InterProcessReadWriteLock; -import org.apache.curator.retry.ExponentialBackoffRetry; -import org.apache.zookeeper.CreateMode; import org.apache.zookeeper.KeeperException; -import org.apache.zookeeper.KeeperException.NoNodeException; -import org.apache.zookeeper.WatchedEvent; -import org.apache.zookeeper.ZooDefs; -import org.apache.zookeeper.ZooKeeper; import org.sleuthkit.autopsy.core.UserPreferences; +import java.io.IOException; +import org.apache.zookeeper.WatchedEvent; +import org.apache.zookeeper.ZooKeeper; +import org.apache.zookeeper.KeeperException.NoNodeException; /** - * A coordination service for maintaining configuration information and - * providing distributed synchronization using a shared hierarchical namespace - * of nodes. - * - * TODO (JIRA 2205): Simple refactoring for general use. + * A centralized service for maintaining configuration information and providing + * distributed synchronization using a shared hierarchical namespace of nodes. */ public final class CoordinationService { + /** + * Category nodes are the immediate children of the root node of a shared + * hierarchical namespace managed by the coordination service. + */ + public enum CategoryNode { // RJCTODO: Move this to CoordinationServiceNamespace + + CASES("cases"), + MANIFESTS("manifests"), + CONFIG("config"), + RESOURCE("resource"); + + private final String displayName; + + private CategoryNode(String displayName) { + this.displayName = displayName; + } + + public String getDisplayName() { + return displayName; + } + } + + /** + * Exception type thrown by the coordination service. + */ + public final static class CoordinationServiceException extends Exception { + + private static final long serialVersionUID = 1L; + + private CoordinationServiceException(String message) { + super(message); + } + + private CoordinationServiceException(String message, Throwable cause) { + super(message, cause); + } + } + + /** + * An opaque encapsulation of a lock for use in distributed synchronization. + * Instances are obtained by calling a get lock method and must be passed to + * a release lock method. + */ + public static class Lock implements AutoCloseable { + + /** + * This implementation uses the Curator read/write lock. 
see + * http://curator.apache.org/curator-recipes/shared-reentrant-read-write-lock.html + */ + private final InterProcessMutex interProcessLock; + private final String nodePath; + + private Lock(String nodePath, InterProcessMutex lock) { + this.nodePath = nodePath; + this.interProcessLock = lock; + } + + public String getNodePath() { + return nodePath; + } + + public void release() throws CoordinationServiceException { + try { + this.interProcessLock.release(); + } catch (Exception ex) { + throw new CoordinationServiceException(String.format("Failed to release the lock on %s", nodePath), ex); + } + } + + @Override + public void close() throws CoordinationServiceException { + release(); + } + } + private static CuratorFramework curator = null; private static final Map rootNodesToServices = new HashMap<>(); + private final Map categoryNodeToPath = new HashMap<>(); private static final int SESSION_TIMEOUT_MILLISECONDS = 300000; private static final int CONNECTION_TIMEOUT_MILLISECONDS = 300000; private static final int ZOOKEEPER_SESSION_TIMEOUT_MILLIS = 3000; private static final int ZOOKEEPER_CONNECTION_TIMEOUT_MILLIS = 15000; - private static final int PORT_OFFSET = 1000; // When run in Solr, ZooKeeper defaults to Solr port + 1000 - private final Map categoryNodeToPath = new HashMap<>(); + private static final int PORT_OFFSET = 1000; /** - * Determines if ZooKeeper is accessible with the current settings. Closes - * the connection prior to returning. - * - * @return true if a connection was achieved, false otherwise - * - * @throws InterruptedException - * @throws IOException - */ - private static boolean isZooKeeperAccessible() throws InterruptedException, IOException { - boolean result = false; - Object workerThreadWaitNotifyLock = new Object(); - int zooKeeperServerPort = Integer.valueOf(UserPreferences.getIndexingServerPort()) + PORT_OFFSET; - String connectString = UserPreferences.getIndexingServerHost() + ":" + zooKeeperServerPort; - ZooKeeper zooKeeper = new ZooKeeper(connectString, ZOOKEEPER_SESSION_TIMEOUT_MILLIS, - (WatchedEvent event) -> { - synchronized (workerThreadWaitNotifyLock) { - workerThreadWaitNotifyLock.notify(); - } - }); - synchronized (workerThreadWaitNotifyLock) { - workerThreadWaitNotifyLock.wait(ZOOKEEPER_CONNECTION_TIMEOUT_MILLIS); - } - ZooKeeper.States state = zooKeeper.getState(); - if (state == ZooKeeper.States.CONNECTED || state == ZooKeeper.States.CONNECTEDREADONLY) { - result = true; - } - zooKeeper.close(); - return result; - } - - /** - * Gets a coordination service for a specific namespace. + * Gets an instance of the centralized coordination service for a specific + * namespace. * * @param rootNode The name of the root node that defines the namespace. * - * @return The coordination service. + * @return The service for the namespace defined by the root node name. * - * @throws CoordinationServiceException If an instance of the coordination + * @throws CoordinationServiceException If an instaNce of the coordination * service cannot be created. */ - public static synchronized CoordinationService getServiceForNamespace(String rootNode) throws CoordinationServiceException { - /* - * Connect to ZooKeeper via Curator. 
- */ + public static synchronized CoordinationService getInstance(String rootNode) throws CoordinationServiceException { if (null == curator) { RetryPolicy retryPolicy = new ExponentialBackoffRetry(1000, 3); + // When run in Solr, ZooKeeper defaults to Solr port + 1000 int zooKeeperServerPort = Integer.valueOf(UserPreferences.getIndexingServerPort()) + PORT_OFFSET; String connectString = UserPreferences.getIndexingServerHost() + ":" + zooKeeperServerPort; curator = CuratorFrameworkFactory.newClient(connectString, SESSION_TIMEOUT_MILLISECONDS, CONNECTION_TIMEOUT_MILLISECONDS, retryPolicy); @@ -117,7 +157,7 @@ public final class CoordinationService { CoordinationService service; try { service = new CoordinationService(rootNode); - } catch (IOException | InterruptedException | KeeperException | CoordinationServiceException ex) { + } catch (Exception ex) { curator = null; throw new CoordinationServiceException("Failed to create coordination service", ex); } @@ -127,18 +167,15 @@ public final class CoordinationService { } /** - * Constructs an instance of the coordination service for a specific - * namespace. + * Constructs an instance of the centralized coordination service for a + * specific namespace. * * @param rootNodeName The name of the root node that defines the namespace. - * - * @throws Exception (calls Curator methods that throw Exception instead of - * more specific exceptions) */ - private CoordinationService(String rootNodeName) throws InterruptedException, IOException, KeeperException, CoordinationServiceException { + private CoordinationService(String rootNodeName) throws Exception { if (false == isZooKeeperAccessible()) { - throw new CoordinationServiceException("Unable to access ZooKeeper"); + throw new Exception("Unable to access ZooKeeper"); } String rootNode = rootNodeName; @@ -154,8 +191,6 @@ public final class CoordinationService { if (ex.code() != KeeperException.Code.NODEEXISTS) { throw ex; } - } catch (Exception ex) { - throw new CoordinationServiceException("Curator experienced an error", ex); } categoryNodeToPath.put(node.getDisplayName(), nodePath); } @@ -350,77 +385,35 @@ public final class CoordinationService { } /** - * Exception type thrown by the coordination service. + * Determines if ZooKeeper is accessible with the current settings. Closes + * the connection prior to returning. + * + * @return true if a connection was achieved, false otherwise */ - public final static class CoordinationServiceException extends Exception { + private static boolean isZooKeeperAccessible() { + boolean result = false; + Object workerThreadWaitNotifyLock = new Object(); + int zooKeeperServerPort = Integer.valueOf(UserPreferences.getIndexingServerPort()) + PORT_OFFSET; + String connectString = UserPreferences.getIndexingServerHost() + ":" + zooKeeperServerPort; - private static final long serialVersionUID = 1L; + try { + ZooKeeper zooKeeper = new ZooKeeper(connectString, ZOOKEEPER_SESSION_TIMEOUT_MILLIS, + (WatchedEvent event) -> { - private CoordinationServiceException(String message) { - super(message); - } - - private CoordinationServiceException(String message, Throwable cause) { - super(message, cause); - } - } - - /** - * An opaque encapsulation of a lock for use in distributed synchronization. - * Instances are obtained by calling a get lock method and must be passed to - * a release lock method. - */ - public static class Lock implements AutoCloseable { - - /** - * This implementation uses the Curator read/write lock. 
see - * http://curator.apache.org/curator-recipes/shared-reentrant-read-write-lock.html - */ - private final InterProcessMutex interProcessLock; - private final String nodePath; - - private Lock(String nodePath, InterProcessMutex lock) { - this.nodePath = nodePath; - this.interProcessLock = lock; - } - - public String getNodePath() { - return nodePath; - } - - public void release() throws CoordinationServiceException { - try { - this.interProcessLock.release(); - } catch (Exception ex) { - throw new CoordinationServiceException(String.format("Failed to release the lock on %s", nodePath), ex); + synchronized (workerThreadWaitNotifyLock) { + workerThreadWaitNotifyLock.notify(); + } + }); + synchronized (workerThreadWaitNotifyLock) { + workerThreadWaitNotifyLock.wait(ZOOKEEPER_CONNECTION_TIMEOUT_MILLIS); } + ZooKeeper.States state = zooKeeper.getState(); + if (state == ZooKeeper.States.CONNECTED || state == ZooKeeper.States.CONNECTEDREADONLY) { + result = true; + } + zooKeeper.close(); + } catch (InterruptedException | IOException ignored) { } - - @Override - public void close() throws CoordinationServiceException { - release(); - } - } - - /** - * Category nodes are the immediate children of the root node of a shared - * hierarchical namespace managed by a coordination service. - */ - public enum CategoryNode { - - CASES("cases"), - MANIFESTS("manifests"), - CONFIG("config"), - RESOURCE("resource"); - - private final String displayName; - - private CategoryNode(String displayName) { - this.displayName = displayName; - } - - public String getDisplayName() { - return displayName; - } + return result; } } diff --git a/Core/src/org/sleuthkit/autopsy/coordinationservice/CoordinationServiceNamespace.java b/Core/src/org/sleuthkit/autopsy/coordinationservice/CoordinationServiceNamespace.java index 567dd38bc6..e1f2a3df42 100644 --- a/Core/src/org/sleuthkit/autopsy/coordinationservice/CoordinationServiceNamespace.java +++ b/Core/src/org/sleuthkit/autopsy/coordinationservice/CoordinationServiceNamespace.java @@ -1,7 +1,7 @@ /* * Autopsy Forensic Browser * - * Copyright 2016-2017 Basis Technology Corp. + * Copyright 2015 Basis Technology Corp. * Contact: carrier sleuthkit org * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -19,7 +19,7 @@ package org.sleuthkit.autopsy.coordinationservice; /** - * Root node for Autopsy coordination service namespace. + * Namespace elements for auto ingest coordination service nodes. */ public final class CoordinationServiceNamespace { private static final String ROOT = "autopsy"; diff --git a/Experimental/src/org/sleuthkit/autopsy/experimental/autoingest/AutoIngestJobLogger.java b/Experimental/src/org/sleuthkit/autopsy/experimental/autoingest/AutoIngestJobLogger.java index e9406950fd..3165ad23c9 100644 --- a/Experimental/src/org/sleuthkit/autopsy/experimental/autoingest/AutoIngestJobLogger.java +++ b/Experimental/src/org/sleuthkit/autopsy/experimental/autoingest/AutoIngestJobLogger.java @@ -431,7 +431,7 @@ final class AutoIngestJobLogger { * log file. 
*/ private void log(MessageCategory category, String message) throws AutoIngestJobLoggerException, InterruptedException { - try (Lock lock = CoordinationService.getServiceForNamespace(CoordinationServiceNamespace.getRoot()).tryGetExclusiveLock(CoordinationService.CategoryNode.CASES, getLogPath(caseDirectoryPath).toString(), LOCK_TIME_OUT, LOCK_TIME_OUT_UNIT)) { + try (Lock lock = CoordinationService.getInstance(CoordinationServiceNamespace.getRoot()).tryGetExclusiveLock(CoordinationService.CategoryNode.CASES, getLogPath(caseDirectoryPath).toString(), LOCK_TIME_OUT, LOCK_TIME_OUT_UNIT)) { if (null != lock) { File logFile = getLogPath(caseDirectoryPath).toFile(); try (PrintWriter writer = new PrintWriter(new BufferedWriter(new FileWriter(logFile, logFile.exists())), true)) { diff --git a/Experimental/src/org/sleuthkit/autopsy/experimental/autoingest/AutoIngestManager.java b/Experimental/src/org/sleuthkit/autopsy/experimental/autoingest/AutoIngestManager.java index bb55281227..867ae631e8 100644 --- a/Experimental/src/org/sleuthkit/autopsy/experimental/autoingest/AutoIngestManager.java +++ b/Experimental/src/org/sleuthkit/autopsy/experimental/autoingest/AutoIngestManager.java @@ -232,7 +232,7 @@ public final class AutoIngestManager extends Observable implements PropertyChang void startUp() throws AutoIngestManagerStartupException { SYS_LOGGER.log(Level.INFO, "Auto ingest starting"); try { - coordinationService = CoordinationService.getServiceForNamespace(CoordinationServiceNamespace.getRoot()); + coordinationService = CoordinationService.getInstance(CoordinationServiceNamespace.getRoot()); } catch (CoordinationServiceException ex) { throw new AutoIngestManagerStartupException("Failed to get coordination service", ex); } diff --git a/Experimental/src/org/sleuthkit/autopsy/experimental/configuration/SharedConfiguration.java b/Experimental/src/org/sleuthkit/autopsy/experimental/configuration/SharedConfiguration.java index 653fc65807..7599ed4648 100644 --- a/Experimental/src/org/sleuthkit/autopsy/experimental/configuration/SharedConfiguration.java +++ b/Experimental/src/org/sleuthkit/autopsy/experimental/configuration/SharedConfiguration.java @@ -160,7 +160,7 @@ public class SharedConfiguration { File remoteFolder = getSharedFolder(); - try (Lock writeLock = CoordinationService.getServiceForNamespace(LOCK_ROOT).tryGetExclusiveLock(CoordinationService.CategoryNode.CONFIG, remoteFolder.getAbsolutePath(), 30, TimeUnit.MINUTES)) { + try (Lock writeLock = CoordinationService.getInstance(LOCK_ROOT).tryGetExclusiveLock(CoordinationService.CategoryNode.CONFIG, remoteFolder.getAbsolutePath(), 30, TimeUnit.MINUTES)) { if (writeLock == null) { logger.log(Level.INFO, String.format("Failed to lock %s - another node is currently uploading or downloading configuration", remoteFolder.getAbsolutePath())); return SharedConfigResult.LOCKED; @@ -230,7 +230,7 @@ public class SharedConfiguration { File remoteFolder = getSharedFolder(); - try (Lock readLock = CoordinationService.getServiceForNamespace(LOCK_ROOT).tryGetSharedLock(CoordinationService.CategoryNode.CONFIG, remoteFolder.getAbsolutePath(), 30, TimeUnit.MINUTES)) { + try (Lock readLock = CoordinationService.getInstance(LOCK_ROOT).tryGetSharedLock(CoordinationService.CategoryNode.CONFIG, remoteFolder.getAbsolutePath(), 30, TimeUnit.MINUTES)) { if (readLock == null) { return SharedConfigResult.LOCKED; } diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java index 
9f3a99d5e5..3386472d03 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java @@ -1,7 +1,20 @@ /* - * To change this license header, choose License Headers in Project Properties. - * To change this template file, choose Tools | Templates - * and open the template in the editor. + * Autopsy Forensic Browser + * + * Copyright 2011-2016 Basis Technology Corp. + * Contact: carrier sleuthkit org + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ package org.sleuthkit.autopsy.keywordsearch; @@ -46,20 +59,13 @@ class Chunker implements Iterator, Iterable { private final PushbackReader reader; /** The local buffer of characters read from the Reader. */ private final char[] tempChunkBuf = new char[READ_CHARS_BUFFER_SIZE]; - /** number of chars read in the most recent read operation. */ - private int charsRead = 0; - /** The text of the current chunk (so far). */ - private StringBuilder currentChunk; - private StringBuilder currentWindow; /** the size in bytes of the chunk (so far). */ private int chunkSizeBytes = 0; - /** the size in chars of the (base) chunk (so far). */ - private int baseChunkSizeChars; - - /** has the chunker reached the end of the Reader? If so, there are no more + /** Has the chunker reached the end of the Reader? If so, there are no more * chunks, and the current chunk does not need a window. */ private boolean endOfReaderReached = false; + /** Store any exception encountered reading from the Reader. */ private Exception ex; /** @@ -68,6 +74,7 @@ class Chunker implements Iterator, Iterable { * @param reader The content to chunk. */ Chunker(Reader reader) { + //Using MAX_TOTAL_CHUNK_SIZE is safe but probably overkill. this.reader = new PushbackReader(reader, MAX_TOTAL_CHUNK_SIZE); } @@ -76,10 +83,18 @@ class Chunker implements Iterator, Iterable { return this; } + /** + * Has this Chunker encountered an exception reading from the Reader. + */ boolean hasException() { return ex != null; } + /** + * Get the exception encountered reading from the Reader. + * + * @return The exception, or null if no exception was encountered. 
+ */ public Exception getException() { return ex; } @@ -115,24 +130,28 @@ class Chunker implements Iterator, Iterable { throw new NoSuchElementException("There are no more chunks."); } //reset state for the next chunk - currentChunk = new StringBuilder(); - currentWindow = new StringBuilder(); + chunkSizeBytes = 0; - baseChunkSizeChars = 0; + int baseChunkSizeChars = 0; + StringBuilder currentChunk = new StringBuilder(); + StringBuilder currentWindow = new StringBuilder(); try { - readBaseChunk(); - baseChunkSizeChars = currentChunk.length(); - readWindow(); + currentChunk.append(readBaseChunk()); + baseChunkSizeChars = currentChunk.length(); //save the base chunk length + currentWindow.append(readWindow()); if (endOfReaderReached) { /* if we have reached the end of the content,we won't make * another overlapping chunk, so the length of the base chunk - * can be extended * to the end. */ + * can be extended to the end. */ baseChunkSizeChars = currentChunk.length(); } else { + /* otherwise we will make another chunk, so unread the window */ reader.unread(currentWindow.toString().toCharArray()); } - } catch (IOException ioEx) { + } catch (Exception ioEx) { + /* Save the exception, which will cause hasNext() to return false, + * and break any chunking loop in client code. */ ex = ioEx; } //add the window text to the current chunk. @@ -142,34 +161,46 @@ class Chunker implements Iterator, Iterable { } /** - * Read the base chunk from the reader, and attempt to break at whitespace. + * Read the base chunk from the reader, attempting to break at whitespace. * * @throws IOException if there is a problem reading from the reader. */ - private void readBaseChunk() throws IOException { + private StringBuilder readBaseChunk() throws IOException { + StringBuilder currentChunk = new StringBuilder(); //read the chunk until the minimum base chunk size readHelper(MINIMUM_BASE_CHUNK_SIZE, currentChunk); //keep reading until the maximum base chunk size or white space is reached. readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, currentChunk); + return currentChunk; } /** - * Read the window from the reader, and attempt to break at whitespace. + * Read the window from the reader, attempting to break at whitespace. * * @throws IOException if there is a problem reading from the reader. */ - private void readWindow() throws IOException { + private StringBuilder readWindow() throws IOException { + StringBuilder currentWindow = new StringBuilder(); //read the window, leaving some room to look for white space to break at. readHelper(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, currentWindow); //keep reading until the max chunk size, or until whitespace is reached. readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, currentWindow); + return currentWindow; } + /** + * Read until the maxBytes reached, or end of reader. + * + * @param maxBytes + * @param currentSegment + * + * @throws IOException + */ private void readHelper(int maxBytes, StringBuilder currentSegment) throws IOException { - - //read chars up to maxBytes, or we reach the end of the reader. + int charsRead = 0; + //read chars up to maxBytes, or the end of the reader. while ((chunkSizeBytes < maxBytes) && (endOfReaderReached == false)) { charsRead = reader.read(tempChunkBuf, 0, READ_CHARS_BUFFER_SIZE); @@ -178,24 +209,25 @@ class Chunker implements Iterator, Iterable { endOfReaderReached = true; return; } else { - //if the last charcter might be part of a surroate pair, unread it. + //if the last char might be part of a surroate pair, unread it. 
final char lastChar = tempChunkBuf[charsRead - 1]; - String chunkSegment; if (Character.isHighSurrogate(lastChar)) { charsRead--; - chunkSegment = new String(tempChunkBuf, 0, charsRead); reader.unread(lastChar); - } else { - chunkSegment = new String(tempChunkBuf, 0, charsRead); } - //add read chars to the chunk and update the length. + String chunkSegment = new String(tempChunkBuf, 0, charsRead); + + //get the length in bytes of the read chars int segmentSize = chunkSegment.getBytes(StandardCharsets.UTF_8).length; + //if it will not put us past maxBytes if (chunkSizeBytes + segmentSize < maxBytes) { + //add it to the chunk currentSegment.append(chunkSegment); chunkSizeBytes += segmentSize; } else { + //unread it, and break out of read loop. reader.unread(tempChunkBuf, 0, charsRead); return; } @@ -203,7 +235,16 @@ class Chunker implements Iterator, Iterable { } } + /** + * Read until the maxBytes reached, whitespace, or end of reader. + * + * @param maxBytes + * @param currentSegment + * + * @throws IOException + */ private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentSegment) throws IOException { + int charsRead = 0; boolean whitespaceFound = false; //read 1 char at a time up to maxBytes, whitespaceFound, or we reach the end of the reader. while ((chunkSizeBytes < maxBytes) @@ -221,22 +262,23 @@ class Chunker implements Iterator, Iterable { if (Character.isHighSurrogate(ch)) { charsRead = reader.read(tempChunkBuf, 1, 1); if (charsRead == -1) { + //this is the last chunk, so include the unpaired surrogate currentSegment.append(ch); chunkSizeBytes += new Character(ch).toString().getBytes(StandardCharsets.UTF_8).length; - //this is the last chunk endOfReaderReached = true; return; } else { + //use the surrogate pair in place of the unpaired surrogate. chunkSegment = new String(tempChunkBuf, 0, 2); } } else { + //one char chunkSegment = new String(tempChunkBuf, 0, 1); } //check for whitespace. whitespaceFound = Character.isWhitespace(chunkSegment.codePointAt(0)); //add read chars to the chunk and update the length. currentSegment.append(chunkSegment); - chunkSizeBytes += chunkSegment.getBytes(StandardCharsets.UTF_8).length; } } @@ -265,5 +307,4 @@ class Chunker implements Iterator, Iterable { return chunksize; } } - }
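
For context, a minimal sketch of how the Chunker that emerges from this series is meant to be driven by indexing code such as Ingester.indexText(): iterate the overlapping chunks produced from a Reader, then consult hasException() once the loop ends, since patch 5 makes read errors end the iteration rather than throw. The class name ChunkerUsageSketch, the sample StringReader input, and the console output are illustrative stand-ins, not code from these patches; a real caller would hand each chunk to Solr instead of printing it.

    package org.sleuthkit.autopsy.keywordsearch;

    import java.io.Reader;
    import java.io.StringReader;

    /**
     * Usage sketch only (not part of the patch series): shows the chunking
     * loop pattern that Ingester uses after patch 5 -- iterate the chunks,
     * then check for a stored read error instead of catching IOException
     * around next().
     */
    class ChunkerUsageSketch {

        static boolean chunkAndReport(Reader extractedText) {
            Chunker chunker = new Chunker(extractedText);
            int chunkId = 0;
            for (Chunker.Chunk chunk : chunker) {
                // Placeholder for the real Solr add: report the total chunk
                // length and the base (non-overlapping) length kept for
                // highlighting offsets.
                System.out.printf("chunk %d: %d chars, base length %d%n",
                        chunkId++, chunk.toString().length(), chunk.getBaseChunkLength());
            }
            if (chunker.hasException()) {
                // A read error makes hasNext() return false, so the loop above
                // simply stops; the caller inspects the stored exception here.
                System.err.println("chunking stopped early: " + chunker.getException());
                return false;
            }
            return true;
        }

        public static void main(String[] args) {
            chunkAndReport(new StringReader("some extracted text to be chunked ..."));
        }
    }

The design choice this illustrates is that the iterator captures any IOException internally and ends iteration, so client code needs only one error check after the loop, while the PushbackReader lets each chunk's trailing window be unread and re-consumed as the start of the next base chunk.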