Repo for the search and displace ingest module that takes odf, docx and pdf documents and transforms them into .md files to be used with search and displace operations.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

196 lines
4.4 KiB

3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
  1. <?php
  2. namespace App\Jobs;
  3. use App\Ingest\Convertor;
  4. use App\Ingest\MDConvertor;
  5. use App\Parser\ParseXml;
  6. use App\Parser\DocxParser\ParseDocx;
  7. use App\Parser\HtmlParser\ParseHtml;
  8. use App\Parser\ParseHtmlArray;
  9. use App\Parser\ParseTextArray;
  10. use Illuminate\Bus\Queueable;
  11. use Illuminate\Contracts\Queue\ShouldQueue;
  12. use Illuminate\Foundation\Bus\Dispatchable;
  13. use Illuminate\Queue\InteractsWithQueue;
  14. use Illuminate\Support\Facades\Log;
  15. use Illuminate\Support\Facades\Storage;
  /**
   * Queued job that runs an uploaded document through the Convertor and, for
   * every type except 'pdf', parses the converted file, re-encodes its content
   * as UTF-8, writes it out as a Markdown file and hands the resulting path to
   * the SendToCore job.
   *
   * Any failure along the way is reported by dispatching SendToCore with the
   * current path and a second argument of true (see failed()).
   */
  class IngestDocuments implements ShouldQueue
  {
      use Dispatchable, InteractsWithQueue, Queueable;

      /**
       * Path of the document on the 'local' disk. Overwritten in handle() with
       * whatever Convertor::execute() returns.
       *
       * @var string
       */
      private $path;

      /**
       * Document type as given by the caller; only the value 'pdf' is
       * special-cased in this class.
       *
       * @var mixed
       */
      protected $type;

      /**
       * Always read through disk(), which re-resolves it when it is empty.
       *
       * @var \Illuminate\Contracts\Filesystem\Filesystem
       */
      private $storage;

      // NOTE(review): the four parser properties below are instantiated in the
      // constructor but never referenced anywhere else in this class. They are
      // kept because their constructors may have side effects - confirm and
      // remove if they are truly unused.

      /**
       * @var \App\Parser\DocxParser\ParseDocx
       */
      private $parserDocx;

      /**
       * @var \App\Parser\ParseXml
       */
      private $parserXml;

      /**
       * @var \App\Parser\HtmlParser\ParseHtml
       */
      private $parserHtml;

      /**
       * @var \App\Parser\ParseHtmlArray
       */
      private $parseHtmlArray;

      /**
       * Create a new job instance.
       *
       * @param string $path Path of the document on the 'local' disk.
       * @param mixed  $type Document type (compared against 'pdf').
       */
      public function __construct(string $path, $type)
      {
          $this->path = $path;
          $this->type = $type;
          $this->storage = Storage::disk('local');
          $this->parserDocx = new ParseDocx();
          $this->parserXml = new ParseXml();
          $this->parserHtml = new ParseHtml();
          $this->parseHtmlArray = new ParseHtmlArray();
      }

      /**
       * Execute the job.
       *
       * Every failure path ends in failed() followed by an early return, so
       * SendToCore is never dispatched with an undefined file path.
       *
       * @return void
       */
      public function handle()
      {
          $convertor = new Convertor($this->path, $this->type);

          try {
              $this->path = $convertor->execute();
          } catch (\Exception $exception) {
              Log::error($exception->getMessage());
              $this->failed();

              return;
          }

          // @TODO Replace later, the convertor will create the .md file.
          if ($this->type === 'pdf') {
              // For PDFs the convertor's output path is forwarded as-is.
              SendToCore::dispatch($this->path);

              return;
          }

          $content = $this->getContent();
          if (! $content) {
              $this->failed();

              return;
          }

          $content = $this->convertToUTF8($content);

          try {
              $filePath = $this->storeContent($content);
          } catch (\Exception $e) {
              Log::error('Error writing in to the file: ' . $e->getMessage());
              // Without this early exit $filePath would be undefined below.
              $this->failed();

              return;
          }

          SendToCore::dispatch($filePath);
      }

      /**
       * Report a failed ingest: log it and dispatch SendToCore with the current
       * path and true as the second argument.
       *
       * @return void
       */
      public function failed()
      {
          Log::error('Ingest documents failed.');

          // @TODO Delete docx, txt and md files.
          SendToCore::dispatch($this->path, true);
      }

      /**
       * Parse the current file with ParseTextArray.
       *
       * For type 'pdf' this first polls the disk once per second until the
       * file exists, and constructs the parser with true as its argument.
       *
       * @return mixed Whatever ParseTextArray::fromFile() returns.
       */
      protected function getContent()
      {
          if ($this->type === 'pdf') {
              // Wait while it finishes.
              // NOTE(review): this loop has no upper bound of its own; it only
              // ends when the file appears - confirm a queue timeout covers it.
              while (! $this->disk()->exists($this->path)) {
                  sleep(1);
              }

              $textParser = new ParseTextArray(true);
          } else {
              $textParser = new ParseTextArray();
          }

          return $textParser->fromFile($this->disk()->path($this->path));
      }

      /**
       * Run mb_convert_encoding(..., 'UTF-8') over every leaf of $content.
       *
       * @param array $content Possibly nested array of values.
       *
       * @return array The same structure with each leaf converted.
       */
      protected function convertToUTF8($content)
      {
          array_walk_recursive(
              $content,
              static function (&$entry) {
                  $entry = mb_convert_encoding($entry, 'UTF-8');
              }
          );

          return $content;
      }

      /**
       * Write $content as a Markdown file next to the current file and delete
       * the current (converted) file.
       *
       * Only the extension of the last path segment is stripped, so dots in
       * directory names or earlier in the file name are preserved.
       *
       * @param mixed $content Content handed to MDConvertor.
       *
       * @return string Path of the stored .md file.
       *
       * @throws \RuntimeException When the Markdown file cannot be written.
       */
      protected function storeContent($content)
      {
          $path = (string) $this->path;
          $lastDot = strrpos($path, '.');
          $lastSlash = strrpos($path, '/');
          $hasExtension = $lastDot !== false
              && ($lastSlash === false || $lastDot > $lastSlash);
          $name = $hasExtension ? substr($path, 0, $lastDot) : $path;

          // Or json?
          $filePath = $this->storeMD($name, $content);

          // Delete converted file. We now have the .md file. Skip the delete
          // when the source already is that .md file, or we would remove the
          // output we just wrote.
          if ($filePath !== $this->path) {
              $this->disk()->delete($this->path);
          }

          return $filePath;
      }

      /**
       * Convert $content with MDConvertor and store it as "$name.md".
       *
       * @param string $name    Target path without extension.
       * @param mixed  $content Content handed to MDConvertor.
       *
       * @return string The stored file name.
       *
       * @throws \RuntimeException When the disk reports the write as failed.
       */
      protected function storeMD($name, $content)
      {
          $fileName = "$name.md";
          $convertor = new MDConvertor($content);

          if ($this->disk()->put($fileName, $convertor->execute()) === false) {
              throw new \RuntimeException("Unable to write file: $fileName");
          }

          return $fileName;
      }

      /**
       * Store $content unchanged as "$name.json".
       *
       * @param string $name    Target path without extension.
       * @param mixed  $content Content written to the file.
       *
       * @return string The stored file name.
       *
       * @throws \RuntimeException When the disk reports the write as failed.
       */
      protected function storeJson($name, $content)
      {
          $fileName = "$name.json";

          if ($this->disk()->put($fileName, $content) === false) {
              throw new \RuntimeException("Unable to write file: $fileName");
          }

          return $fileName;
      }

      /**
       * Return the 'local' storage disk, resolving it again when the property
       * is empty.
       *
       * @return \Illuminate\Contracts\Filesystem\Filesystem
       */
      private function disk()
      {
          if (! $this->storage) {
              $this->storage = Storage::disk('local');
          }

          return $this->storage;
      }
  }