Repo for the search and displace ingest module, which takes ODF, DOCX and PDF documents and transforms them into .md files to be used with search and displace operations.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

72 lines
1.6 KiB

  1. <?php
  2. namespace App\Ingest;
  3. use Illuminate\Support\Facades\Storage;
  /**
   * Converts an ingested document into a `document.json` file holding the
   * data extracted from it.
   *
   * Documents that are not already DOCX are first converted to DOCX through
   * Office; the DOCX is then read with DocxReader and the result is written as
   * JSON next to the document. Relies on members inherited from
   * AbstractConvertor ($storage, $path, $directoryPath, setPath(),
   * deleteOriginalDocument()), whose implementation is not visible here.
   */
  class DataJsonConvertor extends AbstractConvertor
  {
      /**
       * Format of the document currently referenced by the convertor's path
       * (compared against 'docx'; set to 'docx' after a successful conversion).
       *
       * @var string
       */
      protected $type;

      /**
       * @param string $path Path of the document, relative to the 'local' storage disk.
       * @param string $type Format of the document, e.g. 'docx' or 'pdf'.
       */
      public function __construct($path, $type)
      {
          parent::__construct(Storage::disk('local'), $path);
          $this->type = $type;
      }

      /**
       * Convert given document to JSON file which contains the document's data.
       *
       * The source document is only deleted once the JSON file has been
       * written successfully.
       *
       * @throws \Exception When the DOCX conversion, the JSON encoding or the
       *                    write of document.json fails.
       */
      public function execute()
      {
          if ($this->type !== 'docx') {
              $this->convertToDocx();
          }

          $json = $this->convertDocxToJson();

          // json_encode() signals failure by returning false; writing that value
          // and then deleting the source would silently lose the document.
          $encoded = json_encode($json);
          if ($encoded === false) {
              throw new \Exception(
                  'Failed to encode document data as JSON for file: ' . $this->path
                  . ' (' . json_last_error_msg() . ')'
              );
          }

          // Keep the source document in place when the write did not succeed.
          $written = $this->storage->put("$this->directoryPath/document.json", $encoded);
          if ($written === false) {
              throw new \Exception('Failed to write document.json for file: ' . $this->path);
          }

          $this->deleteOriginalDocument();
      }

      /**
       * Read the DOCX at the current path and return its extracted data.
       *
       * @return mixed Whatever DocxReader::execute() returns; it is passed to
       *               json_encode() by execute().
       */
      protected function convertDocxToJson()
      {
          $reader = new DocxReader($this->storage, $this->path);

          return $reader->execute();
      }

      /**
       * Convert document to DOCX format in order to extract data.
       *
       * On success the original document is deleted and the convertor is
       * re-pointed at the DOCX file.
       *
       * @throws \Exception
       */
      protected function convertToDocx()
      {
          $office = new Office();
          $success = $office->run(
              'docx',
              $this->storage->path($this->path),
              $this->storage->path($this->directoryPath)
          );

          if (! $success) {
              throw new \Exception('Failed when converting from ' . $this->type . ' to DOCX for file: ' . $this->path);
          }

          // Delete the source while the path still points at it, then switch
          // the path over to the converted file.
          $this->deleteOriginalDocument();
          $this->setPath($this->docxPathFor($this->path));
          $this->type = 'docx';
      }

      /**
       * Build the path of the DOCX counterpart of a document by swapping only
       * its file extension.
       *
       * Replacing every occurrence of the type string (as str_replace() would)
       * corrupts paths such as "pdfs/report.pdf", and misses extensions whose
       * case or spelling differs from the declared type.
       * NOTE(review): assumes Office::run() writes "<name>.docx" alongside the
       * source document - confirm against Office.
       *
       * @param string $path Path of the source document.
       *
       * @return string The same path with its extension replaced by "docx", or
       *                with ".docx" appended when it has no extension.
       */
      private function docxPathFor($path)
      {
          $extension = pathinfo($path, PATHINFO_EXTENSION);

          if ($extension === '') {
              return $path . '.docx';
          }

          return substr($path, 0, -strlen($extension)) . 'docx';
      }
  }