Repo for the search-and-displace ingest module, which takes ODF, DOCX and PDF documents and transforms them into .md files to be used with search-and-displace operations.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

174 lines
4.4 KiB

3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
  1. <?php
  2. namespace App\Jobs;
  3. use App\Ingest\Convertor;
  4. use App\Ingest\DataJsonConvertor;
  5. use App\Ingest\DocxReader;
  6. use App\Parser\ParseXml;
  7. use App\Parser\DocxParser\ParseDocx;
  8. use App\Parser\HtmlParser\ParseHtml;
  9. use App\Parser\ParseHtmlArray;
  10. use Illuminate\Bus\Queueable;
  11. use Illuminate\Contracts\Queue\ShouldQueue;
  12. use Illuminate\Foundation\Bus\Dispatchable;
  13. use Illuminate\Queue\InteractsWithQueue;
  14. use Illuminate\Support\Carbon;
  15. use Illuminate\Support\Facades\Log;
  16. use Illuminate\Support\Facades\Redis;
  17. use Illuminate\Support\Facades\Storage;
  /**
   * Queued job that converts a single document and then either hands the
   * result to SendToCore (request-driven runs) or cleans up the working
   * directory and updates the Redis-backed performance-analysis counters.
   */
  class IngestDocuments implements ShouldQueue
  {
      use Dispatchable, InteractsWithQueue, Queueable;

      /** @var mixed Identifier forwarded unchanged to SendToCore. */
      protected $id;

      /** @var mixed 'md' selects the Markdown convertor; any other value selects the JSON data convertor. */
      protected $fileResultType;

      /** @var string Path of the document to convert; its parent directory is the working directory. */
      protected $path;

      /** @var mixed Document type passed to the convertor and to SendToCore. */
      protected $type;

      /** @var mixed When truthy the outcome is dispatched to SendToCore instead of being cleaned up locally. */
      protected $fromRequest;

      /**
       * @var \Illuminate\Contracts\Filesystem\Filesystem
       */
      private $storage;

      /**
       * Create a new job instance.
       *
       * @param        $id
       * @param        $fileResultType
       * @param string $path
       * @param        $type
       * @param        $fromRequest
       */
      public function __construct($id, $fileResultType, string $path, $type, $fromRequest)
      {
          $this->id = $id;
          $this->fileResultType = $fileResultType;
          $this->path = $path;
          $this->type = $type;
          $this->fromRequest = $fromRequest;
          $this->storage = Storage::disk('local');
      }

      /**
       * Execute the job.
       *
       * Any \Exception raised by the conversion is logged and routed through
       * failed(); it is not rethrown, so the queue sees the job as completed.
       *
       * @return void
       */
      public function handle()
      {
          try {
              $this->execute();
          } catch (\Exception $exception) {
              // Logged at error level so the cause is not dropped by a log-level filter.
              Log::error('=============== IngestDocuments@handle');
              Log::error($exception->getMessage());
              Log::error($exception->getTraceAsString());
              Log::error('=============== ');

              $this->failed();

              return;
          }

          $this->finish(false);
      }

      /**
       * Run the conversion selected by the requested result type.
       *
       * @throws \Exception
       */
      protected function execute()
      {
          if ($this->fileResultType === 'md') {
              $this->convertToMD();

              return;
          }

          $this->convertToJsonData();
      }

      /**
       * Convert document to plain MD file which is easy to work with.
       *
       * @throws \Exception
       */
      protected function convertToMD()
      {
          $convertor = new Convertor($this->path, $this->type);
          $convertor->execute();
      }

      /**
       * Convert document to JSON data file.
       *
       * @throws \Exception
       */
      protected function convertToJsonData()
      {
          $convertor = new DataJsonConvertor($this->path, $this->type);
          $convertor->execute();
      }

      /**
       * Record the failure and perform the failure-path wrap-up.
       *
       * Called from handle() when the conversion throws.
       * NOTE(review): the queue worker presumably also invokes this for
       * failures that escape handle() — confirm against the framework version in use.
       *
       * @return void
       */
      public function failed()
      {
          Log::error('Ingest documents failed. ' . $this->path);

          $this->finish(true);
      }

      /**
       * Decrement the remaining-files counter and, when it reaches zero,
       * write the performance report into the analysed directory.
       *
       * @param bool $failed Whether the current file failed to convert.
       *
       * @return void
       */
      protected function updateAnalyzer($failed = false)
      {
          $redis = Redis::connection();

          if ($failed) {
              $redis->set('analyze_performance_error', '1');
          }

          // NOTE(review): this read-modify-write is not atomic; concurrent workers
          // could lose a decrement. DECR would fix that but also changes the value
          // left in the key once it reaches zero, so it is left for a separate change.
          $remainingFiles = ((int) $redis->get('analyze_performance_remaining_files')) - 1;

          if ($remainingFiles !== 0) {
              $redis->set('analyze_performance_remaining_files', $remainingFiles);

              return;
          }

          $startedAt = (int) $redis->get('analyze_performance_time');
          $endedAt = (int) Carbon::now()->format('U');
          $directoryPath = $redis->get('analyze_performance_path');

          // Report a failure if ANY file of the run failed, not only the last one:
          // earlier failures are only remembered through the Redis error flag.
          // Assumes whoever seeds the counters also clears the flag — TODO confirm.
          $anyFailed = $failed || (string) $redis->get('analyze_performance_error') === '1';

          $data = 'Time elapsed in seconds: ' . ($endedAt - $startedAt) . "\n";

          if ($anyFailed) {
              $data = $data . 'Something went wrong while processing the files.';
          }

          $reportPath = $directoryPath . '/ingest_analyze_performance.txt';

          if (file_put_contents($reportPath, $data) === false) {
              Log::error('Could not write ingest performance report to ' . $reportPath);
          }
      }

      /**
       * Shared wrap-up for the success and failure paths: notify SendToCore for
       * request-driven runs, otherwise delete the working directory and update
       * the analysis counters.
       *
       * @param bool $failed Whether the conversion failed.
       *
       * @return void
       */
      private function finish($failed)
      {
          $directoryPath = pathinfo($this->path, PATHINFO_DIRNAME);

          if ($this->fromRequest) {
              // The two call shapes are kept exactly as before: the failure flag is
              // only passed on the failure path.
              if ($failed) {
                  SendToCore::dispatch($this->id, $this->fileResultType, $this->type, $directoryPath, true);
              } else {
                  SendToCore::dispatch($this->id, $this->fileResultType, $this->type, $directoryPath);
              }

              return;
          }

          $this->storage()->deleteDirectory($directoryPath);
          $this->updateAnalyzer($failed);
      }

      /**
       * Return the local storage disk, resolving it again when the property is
       * empty so that both the success and failure paths are protected.
       *
       * @return \Illuminate\Contracts\Filesystem\Filesystem
       */
      private function storage()
      {
          if (! $this->storage) {
              $this->storage = Storage::disk('local');
          }

          return $this->storage;
      }
  }