Repository for the Search and Displace core module, including the interface for selecting files and the search-and-displace operations to run on them. https://searchanddisplace.com
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

275 lines
7.5 KiB

3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
  1. <?php
  2. namespace App\SearchDisplace;
  3. use App\SearchDisplace\Ingest\SendDataToRecreateDocument;
  4. use App\SearchDisplace\Ingest\SendDocument;
  5. use Illuminate\Http\File;
  6. use Illuminate\Http\UploadedFile;
  7. use Illuminate\Support\Facades\Storage;
  /**
   * Runs Search and Displace on an original document via the ingest helpers.
   *
   * Job state lives on the `local` storage disk under `contracts/`:
   *  - `contracts/{id}/searchers.json` holds the searchers chosen for the job and
   *    the path recorded for CLI runs;
   *  - a file whose path starts with `contracts/{id}-document` is treated as the
   *    finished document for that job.
   */
  class SearchAndDisplaceOriginalDocument
  {
      /**
       * Registers a new job for the given document and hands the document to ingest.
       *
       * @param UploadedFile|string $document Uploaded file, or a filesystem path when run from the CLI.
       * @param mixed $searchers Searchers to persist for the job (stored as JSON).
       * @return string The generated job id: "{unix timestamp}_{file name without extension}".
       * @throws \Exception
       */
      public function start($document, $searchers)
      {
          $storeResultAtPath = '';

          if ($document instanceof UploadedFile) {
              $fileName = pathinfo($document->getClientOriginalName(), PATHINFO_FILENAME);
          } else {
              // From CLI: wrap the path and remember the directory the file lives in.
              $document = new File($document);
              // pathinfo() strips only the trailing extension; the previous str_replace()
              // removed every occurrence of ".ext" inside the name (e.g. "a.txt.txt" => "a").
              $fileName = pathinfo($document->getFilename(), PATHINFO_FILENAME);
              $storeResultAtPath = $document->getPath();
          }

          $id = time() . '_' . $fileName;

          $this->storeSearchers($id, $searchers, $storeResultAtPath);
          $this->sendDocumentToIngest($id, $document, $fileName);

          return $id;
      }

      /**
       * Applies the stored searchers to the parsed document and sends the result back
       * to be rebuilt.
       *
       * Any \Exception raised while processing is logged and swallowed, not rethrown.
       * NOTE(review): on failure the job directory is left in place, so the job keeps
       * reporting as "in progress" — confirm whether onIngestFail() should be called here.
       *
       * @param string $id Job id returned by start().
       * @param array $contents Must contain a 'document' key holding a JSON string.
       * @param mixed $documentFormat Stored under 'document_format' in the rebuild payload.
       */
      public function applySD($id, $contents, $documentFormat)
      {
          $data = json_decode($contents['document'], true);

          try {
              // json_decode() returns null on malformed JSON; fail with a clear message
              // instead of an "array offset on null" warning further down.
              if (! is_array($data) || ! isset($data['contents']) || ! is_array($data['contents'])) {
                  throw new \Exception('Ingest document payload could not be decoded.');
              }

              $searchAndDisplace = new SearchAndDisplace(
                  $data['contents']['text'],
                  [
                      'searchers' => $this->getSearchers($id),
                  ]
              );

              $result = $searchAndDisplace->execute();

              // Update text and element ranges to match the displaced content.
              $data['contents'] = $this->applyResultsOnIngestData($data['contents'], $result);
              $data['document_format'] = $documentFormat;

              $this->sendDataToIngestToRebuild($id, $data);
          } catch (\Exception $exception) {
              // Logged at error level so failures are not filtered out with routine info logs.
              \Illuminate\Support\Facades\Log::error('========================');
              \Illuminate\Support\Facades\Log::error('Exception - SearchAndDisplaceOriginalDocument@applySD');
              \Illuminate\Support\Facades\Log::error($exception->getMessage());
              \Illuminate\Support\Facades\Log::error($exception->getTraceAsString());
              \Illuminate\Support\Facades\Log::error('========================');
          }
      }

      /**
       * Removes the job's working directory (`contracts/{id}`).
       *
       * @param string $id
       */
      public function onIngestFail($id)
      {
          Storage::disk('local')->deleteDirectory("contracts/$id");
      }

      /**
       * A job has failed when neither its working directory nor a finished document exists.
       *
       * @param string $id
       * @return bool
       */
      public function hasFailed($id)
      {
          if (Storage::disk('local')->exists("contracts/$id")) {
              return false;
          }

          return $this->findDocumentById($id) === null;
      }

      /**
       * A job is in progress while its working directory exists and no finished
       * document has been found yet.
       *
       * @param string $id
       * @return bool
       */
      public function isInProgress($id)
      {
          return Storage::disk('local')->exists("contracts/$id")
              && $this->findDocumentById($id) === null;
      }

      /**
       * Returns the absolute path of the finished document.
       *
       * The document is looked up once; the outcomes match hasFailed()/isInProgress():
       * document found => path, working directory present => still processing,
       * neither => failed.
       *
       * @param string $id
       * @return string
       * @throws \Exception When the job failed or is still processing.
       */
      public function getDownloadPath($id)
      {
          $storage = Storage::disk('local');

          $document = $this->findDocumentById($id);

          if ($document !== null) {
              return $storage->path($document);
          }

          if ($storage->exists("contracts/$id")) {
              throw new \Exception('Document is still processing.');
          }

          throw new \Exception('Document has failed.');
      }

      /**
       * Finds the first file directly under `contracts/` whose path starts with
       * `contracts/{id}-document`.
       *
       * @param string $id
       * @return string|null Path relative to the disk root, or null when none matches.
       */
      protected function findDocumentById($id)
      {
          $prefix = "contracts/$id-document";
          $prefixLength = strlen($prefix);

          foreach (Storage::disk('local')->files('contracts') as $contractFile) {
              if (strncmp($contractFile, $prefix, $prefixLength) === 0) {
                  return $contractFile;
              }
          }

          return null;
      }

      /**
       * Replaces the text and shifts each element's range to match the displaced text.
       *
       * @param array $ingestData Uses 'text' and 'elements' (each with 'range_start' / 'range_end').
       * @param array $sdResult Uses 'content' and 'indexes' (per searcher: lists of
       *                        'original_start', 'original_end', 'start', 'end').
       * @return array The updated ingest data.
       */
      protected function applyResultsOnIngestData($ingestData, $sdResult)
      {
          $ingestData['text'] = $sdResult['content'];

          // Merge the matches of all searchers, keyed by their original start offset.
          $indexes = [];

          foreach ($sdResult['indexes'] ?? [] as $searcherIndexes) {
              foreach ($searcherIndexes as $index) {
                  $indexes[$index['original_start']] = $index;
              }
          }

          // PHP arrays keep insertion order, not key order. The early `break` below
          // relies on ascending offsets, so sort explicitly; otherwise matches coming
          // from a later searcher are skipped and their length change is lost.
          ksort($indexes);

          // Accumulated length change of all previous elements.
          $lastOffset = 0;

          foreach ($ingestData['elements'] ?? [] as $elementIndex => $element) {
              // Length change caused by matches inside this element.
              $currentOffset = 0;

              foreach ($indexes as $i => $index) {
                  if ($index['original_start'] > $element['range_end']) {
                      // Sorted ASC: every remaining match starts after this element.
                      break;
                  }

                  if ($index['original_end'] < $element['range_start']) {
                      continue;
                  }

                  $isInsideElement = $index['original_start'] >= $element['range_start']
                      && $index['original_end'] <= $element['range_end'];

                  if ($isInsideElement) {
                      // (shift of the end) - (shift of the start) = change in match length.
                      $endDifference = ($index['end'] - $index['original_end'])
                          - ($index['start'] - $index['original_start']);

                      $ingestData['elements'][$elementIndex]['range_end'] += $endDifference;
                      $currentOffset += $endDifference;

                      // Consumed; later elements do not need to look at it again.
                      unset($indexes[$i]);
                  }
              }

              $ingestData['elements'][$elementIndex]['range_start'] += $lastOffset;
              $ingestData['elements'][$elementIndex]['range_end'] += $lastOffset;

              $lastOffset += $currentOffset;
          }

          return $ingestData;
      }

      /**
       * Creates the job directory and writes `searchers.json` into it.
       *
       * @param string $id
       * @param mixed $searchers
       * @param string|null $storeResultAtPath Stored under 'document_path' ('' when null).
       */
      protected function storeSearchers($id, $searchers, $storeResultAtPath)
      {
          $storage = Storage::disk('local');
          $directory = "contracts/$id";

          $storage->makeDirectory($directory);

          $storage->put("$directory/searchers.json", json_encode([
              'searchers' => $searchers,
              'document_path' => $storeResultAtPath ?? '',
          ]));
      }

      /**
       * Returns the searchers stored for the job.
       *
       * @param string $id
       * @return mixed The value stored under 'searchers' by storeSearchers().
       * @throws \Exception When the searchers file is missing, empty or not valid JSON.
       */
      protected function getSearchers($id)
      {
          return $this->readSearchersFile($id)['searchers'];
      }

      /**
       * Returns the path stored under 'document_path' for the job.
       *
       * @param string $id
       * @return string '' when no path was recorded.
       * @throws \Exception When the searchers file is missing, empty or not valid JSON.
       */
      public function getStoreAtPathFromJsonFile($id)
      {
          return $this->readSearchersFile($id)['document_path'] ?? '';
      }

      /**
       * Reads and decodes `contracts/{id}/searchers.json`.
       *
       * @param string $id
       * @return array
       * @throws \Exception When the file is missing, empty or not valid JSON.
       */
      private function readSearchersFile($id)
      {
          $raw = Storage::disk('local')->get("contracts/$id/searchers.json");

          if (! $raw) {
              throw new \Exception('Searchers do not exist.');
          }

          $decoded = json_decode($raw, true);

          if (! is_array($decoded)) {
              throw new \Exception('Searchers do not exist.');
          }

          return $decoded;
      }

      /**
       * Sends the document's real path, MIME type and name through SendDocument,
       * tagged as 'original'.
       *
       * @param string $id
       * @param UploadedFile|File $document
       * @param string $fileName
       * @throws \Exception
       */
      protected function sendDocumentToIngest($id, $document, $fileName)
      {
          $sendDocument = new SendDocument();

          $sendDocument->execute($id, [
              'path' => $document->getRealPath(),
              'type' => $document->getMimeType(),
              'name' => $fileName
          ], 'original');
      }

      /**
       * Passes the updated data to SendDataToRecreateDocument.
       *
       * @param string $id
       * @param array $data
       * @throws \GuzzleHttp\Exception\GuzzleException
       */
      protected function sendDataToIngestToRebuild($id, $data)
      {
          $handler = new SendDataToRecreateDocument();

          $handler->execute($id, $data);
      }
  }