Repo for the search and displace core module including the interface to select files and search and displace operations to run on them.
https://searchanddisplace.com
You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
275 lines
7.5 KiB
275 lines
7.5 KiB
<?php
|
|
|
|
namespace App\SearchDisplace;
|
|
|
|
use App\SearchDisplace\Ingest\SendDataToRecreateDocument;
|
|
use App\SearchDisplace\Ingest\SendDocument;
|
|
use Illuminate\Http\File;
|
|
use Illuminate\Http\UploadedFile;
|
|
use Illuminate\Support\Facades\Storage;
|
|
|
|
/**
 * Coordinates running Search & Displace on an "original" document.
 *
 * Flow, as far as this class shows it:
 *  1. start()    stores the searchers under "contracts/<id>/searchers.json" on the local disk
 *                and hands the document to the Ingest service (SendDocument).
 *  2. applySD()  receives the ingested payload, runs the stored searchers on its text,
 *                fixes the element ranges and sends everything back to be rebuilt
 *                (SendDataToRecreateDocument).
 *  3. The status helpers (hasFailed / isInProgress / getDownloadPath) inspect the local disk
 *     for the working directory and for a file whose path starts with
 *     "contracts/<id>-document".
 */
class SearchAndDisplaceOriginalDocument
{
    /**
     * Register a document for processing and send it to Ingest.
     *
     * @param UploadedFile|string $document An uploaded file, or (from the CLI) a path to a file on disk.
     * @param mixed $searchers Searchers to run later; stored as-is in searchers.json.
     * @return string The generated id: "<unix timestamp>_<file name without extension>".
     * @throws \Exception
     */
    public function start($document, $searchers)
    {
        $storeResultAtPath = '';

        if ($document instanceof UploadedFile) {
            $fileName = pathinfo($document->getClientOriginalName(), PATHINFO_FILENAME);
        } else {
            // From CLI.
            $document = new File($document);

            // Strip only the trailing extension. A plain str_replace of ".<ext>" would also
            // remove that sequence from the middle of the name (e.g. "x.docx.doc" => "xx").
            $fileName = pathinfo($document->getFilename(), PATHINFO_FILENAME);

            // Only CLI runs record the source directory alongside the searchers.
            $storeResultAtPath = $document->getPath();
        }

        $id = time() . '_' . $fileName;

        $this->storeSearchers($id, $searchers, $storeResultAtPath);
        $this->sendDocumentToIngest($id, $document, $fileName);

        return $id;
    }

    /**
     * Run the stored searchers on an ingested document and send the result back to be rebuilt.
     *
     * Every \Exception raised while processing is logged and swallowed, so callers never
     * see a failure from this method.
     *
     * @param string $id Id previously returned by start().
     * @param array $contents Must contain a 'document' entry holding a JSON string; the decoded
     *                        payload is read at ['contents']['text'] and ['contents']['elements'].
     * @param mixed $documentFormat Stored under 'document_format' in the payload sent back.
     */
    public function applySD($id, $contents, $documentFormat)
    {
        $data = json_decode($contents['document'], true);

        try {
            $searchAndDisplace = new SearchAndDisplace(
                $data['contents']['text'],
                [
                    'searchers' => $this->getSearchers($id),
                ]
            );

            $result = $searchAndDisplace->execute();

            // Update text and element ranges to match the displaced content.
            $data['contents'] = $this->applyResultsOnIngestData($data['contents'], $result);

            $data['document_format'] = $documentFormat;

            $this->sendDataToIngestToRebuild($id, $data);
        } catch (\Exception $exception) {
            \Illuminate\Support\Facades\Log::info('========================');
            \Illuminate\Support\Facades\Log::info('Exception - SearchAndDisplaceOriginalDocument@applySD');
            \Illuminate\Support\Facades\Log::info($exception->getMessage());
            \Illuminate\Support\Facades\Log::info($exception->getTraceAsString());
            \Illuminate\Support\Facades\Log::info('========================');
        }
    }

    /**
     * Clean up after a failed ingest by deleting the document's working directory.
     *
     * @param string $id
     */
    public function onIngestFail($id)
    {
        Storage::disk('local')->deleteDirectory($this->workingDirectory($id));
    }

    /**
     * A document counts as failed when neither its working directory nor a result file exists.
     *
     * @param string $id
     * @return bool
     */
    public function hasFailed($id)
    {
        if (Storage::disk('local')->exists($this->workingDirectory($id))) {
            return false;
        }

        return $this->findDocumentById($id) === null;
    }

    /**
     * A document is in progress while its working directory exists but no result file does.
     *
     * @param string $id
     * @return bool
     */
    public function isInProgress($id)
    {
        return Storage::disk('local')->exists($this->workingDirectory($id))
            && $this->findDocumentById($id) === null;
    }

    /**
     * Absolute path of the processed document on the local disk.
     *
     * @param string $id
     * @return string
     * @throws \Exception When the document has failed or is still processing.
     */
    public function getDownloadPath($id)
    {
        $storage = Storage::disk('local');

        // Look the document up once; both error states are only possible when it is missing,
        // and they differ solely by whether the working directory still exists.
        $documentPath = $this->findDocumentById($id);

        if ($documentPath === null) {
            if ($storage->exists($this->workingDirectory($id))) {
                throw new \Exception('Document is still processing.');
            }

            throw new \Exception('Document has failed.');
        }

        return $storage->path($documentPath);
    }

    /**
     * Find the first file directly under "contracts" whose path starts with
     * "contracts/<id>-document".
     *
     * @param string $id
     * @return string|null Disk-relative path of the file, or null when there is none.
     */
    protected function findDocumentById($id)
    {
        $prefix = "contracts/$id-document";
        $prefixLength = strlen($prefix);

        foreach (Storage::disk('local')->files('contracts') as $candidate) {
            if (strncmp($candidate, $prefix, $prefixLength) === 0) {
                return $candidate;
            }
        }

        return null;
    }

    /**
     * Replace the ingest text with the displaced content and shift every element range so it
     * still covers the same piece of text.
     *
     * Only matches that lie entirely inside one element change that element's length; matches
     * spanning several elements are left out of the calculation.
     *
     * @param array $ingestData Expects 'elements', each with 'range_start' and 'range_end'.
     * @param array $sdResult Expects 'content' and 'indexes' (searcher => list of matches, each
     *                        with 'start', 'end', 'original_start' and 'original_end').
     * @return array The updated ingest data.
     */
    protected function applyResultsOnIngestData($ingestData, $sdResult)
    {
        $ingestData['text'] = $sdResult['content'];

        // Flatten the matches of every searcher, keyed by their original start.
        // NOTE(review): two matches sharing the same original start overwrite each other.
        $indexes = [];

        foreach ($sdResult['indexes'] as $searcherIndexes) {
            foreach ($searcherIndexes as $index) {
                $indexes[$index['original_start']] = $index;
            }
        }

        // PHP arrays keep insertion order, so keying by start is not enough: sort explicitly.
        // The early "break" below is only valid when the matches are visited in ASC order.
        ksort($indexes);

        // Net length change produced by all elements handled so far.
        $lastOffset = 0;

        foreach ($ingestData['elements'] as $elementIndex => $element) {
            // Net length change produced inside the current element.
            $currentOffset = 0;

            foreach ($indexes as $i => $index) {
                if ($index['original_start'] > $element['range_end']) {
                    // Sorted ASC: every remaining match starts after this element.
                    break;
                }

                if ($index['original_end'] < $element['range_start']) {
                    continue;
                }

                $isInsideElement = $index['original_start'] >= $element['range_start']
                    && $index['original_end'] <= $element['range_end'];

                if ($isInsideElement) {
                    // How much the end moved minus how much the start moved, i.e. the change in
                    // length of this single match.
                    $endDifference = ($index['end'] - $index['original_end'])
                        - ($index['start'] - $index['original_start']);

                    $ingestData['elements'][$elementIndex]['range_end'] += $endDifference;
                    $currentOffset += $endDifference;

                    // A consumed match cannot belong to a later element.
                    unset($indexes[$i]);
                }
            }

            // Shift the whole element by everything that changed before it.
            $ingestData['elements'][$elementIndex]['range_start'] += $lastOffset;
            $ingestData['elements'][$elementIndex]['range_end'] += $lastOffset;

            $lastOffset += $currentOffset;
        }

        return $ingestData;
    }

    /**
     * Create the working directory for a document and persist its searchers.json.
     *
     * @param string $id
     * @param mixed $searchers
     * @param string|null $storeResultAtPath Stored under 'document_path'; null becomes ''.
     */
    protected function storeSearchers($id, $searchers, $storeResultAtPath)
    {
        $storage = Storage::disk('local');
        $directory = $this->workingDirectory($id);

        $storage->makeDirectory($directory);

        $storage->put("$directory/searchers.json", json_encode([
            'searchers' => $searchers,
            'document_path' => $storeResultAtPath ?? '',
        ]));
    }

    /**
     * Searchers stored for a document by storeSearchers().
     *
     * @param string $id
     * @return mixed The decoded 'searchers' entry of searchers.json.
     * @throws \Illuminate\Contracts\Filesystem\FileNotFoundException
     * @throws \Exception When searchers.json is missing or empty.
     */
    protected function getSearchers($id)
    {
        return $this->readSearchersFile($id)['searchers'];
    }

    /**
     * The 'document_path' stored for a document by storeSearchers().
     *
     * @param string $id
     * @return mixed The decoded 'document_path' entry of searchers.json.
     * @throws \Illuminate\Contracts\Filesystem\FileNotFoundException
     * @throws \Exception When searchers.json is missing or empty.
     */
    public function getStoreAtPathFromJsonFile($id)
    {
        return $this->readSearchersFile($id)['document_path'];
    }

    /**
     * Send the original document to Ingest.
     *
     * @param string $id
     * @param File|UploadedFile $document
     * @param string $fileName File name without extension.
     * @throws \Exception
     */
    protected function sendDocumentToIngest($id, $document, $fileName)
    {
        $sendDocument = new SendDocument();

        $sendDocument->execute($id, [
            'path' => $document->getRealPath(),
            'type' => $document->getMimeType(),
            'name' => $fileName
        ], 'original');
    }

    /**
     * Send the updated ingest payload back so the document can be rebuilt.
     *
     * @param string $id
     * @param array $data
     * @throws \GuzzleHttp\Exception\GuzzleException
     */
    protected function sendDataToIngestToRebuild($id, $data)
    {
        $handler = new SendDataToRecreateDocument();

        $handler->execute($id, $data);
    }

    /**
     * Disk-relative working directory of a document.
     *
     * @param string $id
     * @return string
     */
    private function workingDirectory($id)
    {
        return "contracts/$id";
    }

    /**
     * Read and decode the searchers.json of a document.
     *
     * @param string $id
     * @return mixed Whatever json_decode() yields for the file contents (associative mode).
     * @throws \Illuminate\Contracts\Filesystem\FileNotFoundException
     * @throws \Exception When the file is missing or empty.
     */
    private function readSearchersFile($id)
    {
        $directory = $this->workingDirectory($id);

        $searchers = Storage::disk('local')->get("$directory/searchers.json");

        if ( ! $searchers) {
            throw new \Exception('Searchers do not exist.');
        }

        return json_decode($searchers, true);
    }
}
|