Repo for the search and displace ingest module, which takes odf, docx and pdf files and transforms them into .md for use with search and displace operations.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

193 lines
4.5 KiB

3 years ago
3 years ago
3 years ago
  1. <?php
  2. namespace App\Ingest;
  3. use Illuminate\Support\Facades\Storage;
  4. use Symfony\Component\Process\Exception\ProcessFailedException;
  5. use Symfony\Component\Process\Process;
  6. class Convertor
  7. {
  8. /**
  9. * @var \Illuminate\Contracts\Filesystem\Filesystem
  10. */
  11. private $storage;
  12. private $path;
  13. protected $type;
  14. public function __construct($path, $type)
  15. {
  16. $this->storage = Storage::disk('local');
  17. $this->path = $path;
  18. $this->type = $type;
  19. }
  20. public function execute()
  21. {
  22. if ($this->type === 'txt') {
  23. return $this->path;
  24. }
  25. if ($this->type === 'pdf') {
  26. $this->convertPdfToText();
  27. return $this->path;
  28. }
  29. if ($this->type !== 'docx') {
  30. $this->convertToDocx();
  31. }
  32. $this->convertDocumentToText();
  33. //$this->convertToHtml();
  34. return $this->path;
  35. }
  36. /**
  37. * Convert doc,dot,rtf,odt,pdf,docx to docx
  38. *
  39. *
  40. * @return string|void
  41. */
  42. private function convertToDocx()
  43. {
  44. (new Process(['export HOME=' . env('USER_HOME_PATH')]))->run();
  45. /**
  46. * Convert doc,dot,rtf,odt to docx
  47. */
  48. $process = new Process([
  49. 'soffice',
  50. '--headless',
  51. '--convert-to',
  52. 'docx',
  53. $this->storage->path($this->path),
  54. '--outdir',
  55. $this->storage->path('contracts')
  56. ]);
  57. $process->run();
  58. if (!$process->isSuccessful()) {
  59. throw new ProcessFailedException($process);
  60. }
  61. $this->storage->delete($this->path);
  62. $this->path = str_replace($this->type, 'docx', $this->path);
  63. }
  64. /**
  65. * Convert docx file to text
  66. *
  67. *
  68. * @return string|void
  69. */
  70. private function convertDocumentToText()
  71. {
  72. (new Process(['export HOME=' . env('USER_HOME_PATH')]))->run();
  73. $process = new Process([
  74. 'soffice',
  75. '--headless',
  76. '--convert-to',
  77. 'txt',
  78. $this->storage->path($this->path),
  79. '--outdir',
  80. $this->storage->path('contracts')
  81. ]);
  82. $process->run();
  83. if (!$process->isSuccessful()) {
  84. throw new ProcessFailedException($process);
  85. }
  86. $this->storage->delete($this->path);
  87. $this->path = str_replace(['.docx', '.bin'], '.txt', $this->path);
  88. }
  89. private function convertPdfToText()
  90. {
  91. (new Process(['export HOME=' . env('USER_HOME_PATH')]))->run();
  92. $process = new Process([
  93. 'pip3',
  94. 'install',
  95. 'pdftotext',
  96. ]);
  97. $process->run();
  98. if (!$process->isSuccessful()) {
  99. throw new ProcessFailedException($process);
  100. }
  101. $process = new Process([
  102. 'python3',
  103. storage_path('scripts' . DIRECTORY_SEPARATOR . 'parse-pdf.py'),
  104. '-i',
  105. $this->storage->path($this->path),
  106. '-o',
  107. $this->storage->path(str_replace('.pdf', '.txt', $this->path))
  108. ]);
  109. $process->run();
  110. if (!$process->isSuccessful()) {
  111. throw new ProcessFailedException($process);
  112. }
  113. $this->storage->delete($this->path);
  114. $this->path = str_replace('pdf', 'txt', $this->path);
  115. }
  116. private function convertToHtml()
  117. {
  118. (new Process(['export HOME=' . env('USER_HOME_PATH')]))->run();
  119. $process = new Process([
  120. 'soffice',
  121. '--headless',
  122. '--convert-to',
  123. 'html:HTML:EmbedImages',
  124. $this->storage->path($this->path),
  125. '--outdir',
  126. $this->storage->path('contracts')
  127. ]);
  128. $process->run();
  129. if (!$process->isSuccessful()) {
  130. throw new ProcessFailedException($process);
  131. }
  132. $this->storage->delete($this->path);
  133. $this->path = str_replace($this->type, 'html', $this->path);
  134. }
  135. private function convertToXML()
  136. {
  137. //Convert the file to xml using pdftohtml to xml and run a python scrypt to fix the paragraphs
  138. $process = new Process([
  139. 'pdftohtml',
  140. '-xml',
  141. '-i',
  142. $this->storage->path($this->path)
  143. ]);
  144. $process->run();
  145. if (!$process->isSuccessful()) {
  146. throw new ProcessFailedException($process);
  147. }
  148. $this->storage->delete($this->path);
  149. $this->path = str_replace($this->type, 'xml', $this->path);
  150. }
  151. }