Repo for the search and displace ingest module, which takes ODF, DOCX and PDF files and transforms them into Markdown (.md) for use with search and displace operations.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

509 lines
14 KiB

3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
  1. <?php
  2. namespace App\Ingest;
  3. use Illuminate\Support\Facades\Storage;
  4. use Symfony\Component\Process\Exception\ProcessFailedException;
  5. use Symfony\Component\Process\Process;
  6. use League\HTMLToMarkdown\HtmlConverter;
  7. class Convertor
  8. {
  9. /**
  10. * @var \Illuminate\Contracts\Filesystem\Filesystem
  11. */
  12. private $storage;
  13. private $path;
  14. protected $type;
  15. public function __construct($path, $type)
  16. {
  17. $this->storage = Storage::disk('local');
  18. $this->path = $path;
  19. $this->type = $type;
  20. }
  21. /**
  22. * @return mixed
  23. * @throws \Exception
  24. */
  25. public function execute()
  26. {
  27. if ($this->type === 'txt') {
  28. return $this->path;
  29. }
  30. if ($this->type === 'pdf') {
  31. // $this->convertPdfToText();
  32. $this->convertPdfToMD();
  33. // $this->getHtmlContentsFromPdfWithImages();
  34. return $this->path;
  35. }
  36. if ($this->type !== 'docx') {
  37. $this->convertToDocx();
  38. }
  39. $this->convertDocumentToText();
  40. //$this->convertToHtml();
  41. return $this->path;
  42. }
  43. /**
  44. * Convert doc,dot,rtf,odt,pdf,docx to docx
  45. *
  46. *
  47. * @return string|void
  48. */
  49. private function convertToDocx()
  50. {
  51. (new Process(['export HOME=' . env('USER_HOME_PATH')]))->run();
  52. /**
  53. * Convert doc,dot,rtf,odt to docx
  54. */
  55. $process = new Process([
  56. 'soffice',
  57. '--headless',
  58. '--convert-to',
  59. 'docx',
  60. $this->storage->path($this->path),
  61. '--outdir',
  62. $this->storage->path('contracts')
  63. ]);
  64. $process->run();
  65. if (!$process->isSuccessful()) {
  66. throw new ProcessFailedException($process);
  67. }
  68. $this->storage->delete($this->path);
  69. $this->path = str_replace(".$this->type", '.docx', $this->path);
  70. }
  71. /**
  72. * Convert docx file to text
  73. *
  74. * @return void
  75. */
  76. private function convertDocumentToText()
  77. {
  78. (new Process(['export HOME=' . env('USER_HOME_PATH')]))->run();
  79. $process = new Process([
  80. 'soffice',
  81. '--headless',
  82. '--convert-to',
  83. 'txt',
  84. $this->storage->path($this->path),
  85. '--outdir',
  86. $this->storage->path('contracts')
  87. ]);
  88. $process->run();
  89. if (!$process->isSuccessful()) {
  90. throw new ProcessFailedException($process);
  91. }
  92. $this->storage->delete($this->path);
  93. $this->path = str_replace(['.docx', '.bin'], '.txt', $this->path);
  94. }
  95. protected function convertPdfToText()
  96. {
  97. $this->prepareForConvertPDF();
  98. $images = $this->getImagesFromPDF();
  99. $contents = $this->getTextContentsFromPDF();
  100. if (!$contents && count($images) === 0) {
  101. throw new \Exception('Could not read from file.');
  102. }
  103. // Handle images and image contents.
  104. if (count($images) > 0) {
  105. foreach ($images as $image) {
  106. try {
  107. $ocr = new OCR($this->storage->path($image));
  108. $imageContents = $ocr->execute();
  109. $contents = $contents . "\n" . $imageContents;
  110. } catch (\Exception $exception) {
  111. \Illuminate\Support\Facades\Log::info('something wrong: ' . $exception->getMessage());
  112. }
  113. }
  114. $dir = str_replace('.pdf', '', $this->path);
  115. $this->storage->deleteDirectory($dir);
  116. }
  117. $this->storage->delete($this->path);
  118. $this->path = str_replace('.pdf', '.txt', $this->path);
  119. $this->storage->put($this->path, $contents);
  120. }
  121. protected function convertPdfToMD()
  122. {
  123. // $this->prepareForConvertPDF();
  124. $result = $this->getContentsFromPdf();
  125. if ( ! $result['has_images'] && ! $result['has_text']) {
  126. throw new \Exception('Cannot get pdf file contents.');
  127. }
  128. if ($result['has_text']) {
  129. if ($result['has_images']) {
  130. // Both text and images.
  131. throw new \Exception('Not supported for now.');
  132. }
  133. // Delete directory because the contents are in the '$result' variable.
  134. $this->storage->deleteDirectory($this->path);
  135. $mdContents = '';
  136. foreach ($result['htmls'] as $html) {
  137. $converter = new HtmlConverter();
  138. $converter->getConfig()->setOption('strip_tags', true);
  139. $contents = $converter->convert($html);
  140. $mdContents = $mdContents . $contents;
  141. }
  142. $this->path = "$this->path.md";
  143. $this->storage->put($this->path, $mdContents);
  144. return;
  145. }
  146. // Only contains images.
  147. $imagesContent = '';
  148. $files = $this->storage->allFiles($this->path);
  149. foreach ($files as $file) {
  150. // Only get the image files from the directory, it may contain some empty html files too.
  151. if (in_array(pathinfo($file, PATHINFO_EXTENSION), ['jpg', 'png'])) {
  152. $ocr = new OCR($this->storage->path($file));
  153. $imagesContent = $imagesContent . $ocr->execute();
  154. }
  155. }
  156. \Illuminate\Support\Facades\Log::info('============================');
  157. \Illuminate\Support\Facades\Log::info($this->path);
  158. // We are done with the images processing, delete directory.
  159. $this->storage->deleteDirectory($this->path);
  160. $this->path = "$this->path.md";
  161. \Illuminate\Support\Facades\Log::info($this->path);
  162. \Illuminate\Support\Facades\Log::info('++++++++++++++++++++++++++');
  163. $this->storage->put($this->path, $imagesContent);
  164. }
  165. private function convertToHtml()
  166. {
  167. (new Process(['export HOME=' . env('USER_HOME_PATH')]))->run();
  168. $process = new Process([
  169. 'soffice',
  170. '--headless',
  171. '--convert-to',
  172. 'html:HTML:EmbedImages',
  173. $this->storage->path($this->path),
  174. '--outdir',
  175. $this->storage->path('contracts')
  176. ]);
  177. $process->run();
  178. if (!$process->isSuccessful()) {
  179. throw new ProcessFailedException($process);
  180. }
  181. $this->storage->delete($this->path);
  182. $this->path = str_replace(".$this->type", '.html', $this->path);
  183. }
  184. private function convertToXML()
  185. {
  186. //Convert the file to xml using pdftohtml to xml and run a python scrypt to fix the paragraphs
  187. $process = new Process([
  188. 'pdftohtml',
  189. '-xml',
  190. '-i',
  191. $this->storage->path($this->path)
  192. ]);
  193. $process->run();
  194. if (!$process->isSuccessful()) {
  195. throw new ProcessFailedException($process);
  196. }
  197. $this->storage->delete($this->path);
  198. $this->path = str_replace(".$this->type", '.xml', $this->path);
  199. }
  200. protected function prepareForConvertPDF()
  201. {
  202. (new Process(['export HOME=' . env('USER_HOME_PATH')]))->run();
  203. $process = new Process([
  204. 'pip3',
  205. 'install',
  206. 'pdftotext',
  207. ]);
  208. $process->run();
  209. if (!$process->isSuccessful()) {
  210. throw new ProcessFailedException($process);
  211. }
  212. }
  213. protected function getImagesFromPDF()
  214. {
  215. $dir = str_replace('.pdf', '', $this->path);
  216. $this->storage->makeDirectory($dir);
  217. $process = new Process([
  218. 'pdfimages',
  219. '-p',
  220. $this->storage->path($this->path),
  221. '-tiff',
  222. $this->storage->path("$dir/ocr")
  223. ]);
  224. $process->run();
  225. if (!$process->isSuccessful()) {
  226. throw new ProcessFailedException($process);
  227. }
  228. return $this->storage->allFiles($dir);
  229. }
  230. protected function getTextContentsFromPDF()
  231. {
  232. $outputPath = $this->storage->path(str_replace('.pdf', '.txt', $this->path));
  233. $process = new Process([
  234. 'python3',
  235. storage_path('scripts' . DIRECTORY_SEPARATOR . 'parse-pdf.py'),
  236. '-i',
  237. $this->storage->path($this->path),
  238. '-o',
  239. $outputPath
  240. ]);
  241. $process->run();
  242. if (!$process->isSuccessful()) {
  243. throw new ProcessFailedException($process);
  244. }
  245. return file_get_contents($outputPath);
  246. }
  247. protected function getHtmlContentsFromPdfWithImages()
  248. {
  249. $dirName = str_replace('.pdf', '', $this->path);
  250. $this->storage->makeDirectory($dirName);
  251. $outputPath = $this->storage->path("$dirName/html");
  252. $process = new Process([
  253. 'pdftohtml',
  254. '-noframes',
  255. $this->storage->path($this->path),
  256. $outputPath
  257. ]);
  258. $process->run();
  259. if (!$process->isSuccessful()) {
  260. throw new ProcessFailedException($process);
  261. }
  262. $this->storage->delete($this->path);
  263. $this->path = $dirName;
  264. $converter = new HtmlConverter();
  265. $converter->getConfig()->setOption('strip_tags', true);
  266. $files = $this->storage->allFiles($this->path);
  267. $htmlFileIndex = null;
  268. foreach ($files as $index => $file) {
  269. // if (pathinfo($file, PATHINFO_BASENAME) === 'html-html.html') {
  270. // if (pathinfo($file, PATHINFO_EXTENSION) === 'html') {
  271. if (pathinfo($file, PATHINFO_BASENAME) === 'html.html') {
  272. $htmlFileIndex = $index;
  273. break;
  274. }
  275. }
  276. $htmlContents = $this->storage->get($files[$htmlFileIndex]);
  277. $contents = $converter->convert($htmlContents);
  278. // $this->storage->deleteDirectory($this->path);
  279. $this->path = "$this->path.md";
  280. $this->storage->put($this->path, $contents);
  281. dd(3);
  282. }
  283. protected function getContentsFromPdf()
  284. {
  285. $dirName = str_replace('.pdf', '', $this->path);
  286. $this->storage->makeDirectory($dirName);
  287. $outputPath = $this->storage->path("$dirName/html");
  288. $process = new Process([
  289. 'pdftohtml',
  290. '-xml',
  291. $this->storage->path($this->path),
  292. $outputPath
  293. ]);
  294. $process->run();
  295. if (!$process->isSuccessful()) {
  296. throw new ProcessFailedException($process);
  297. }
  298. $this->storage->delete($this->path);
  299. $this->path = $dirName;
  300. $contents = $this->storage->get("$this->path/html.xml");
  301. $xml = simplexml_load_string($contents);
  302. $fonts = [];
  303. foreach ($xml->page as $page) {
  304. foreach ($page as $p) {
  305. if ($p->getName() === 'fontspec') {
  306. $fonts[(int) $p['id']]['family'] = (string) $p['family'];
  307. $fonts[(int) $p['id']]['size'] = (string) $p['size'];
  308. $fonts[(int) $p['id']]['color'] = (string) $p['color'];
  309. }
  310. }
  311. }
  312. $htmls = [];
  313. $hasImages = false;
  314. $hasText = false;
  315. try {
  316. foreach ($xml->page as $page) {
  317. $html = '';
  318. $previousP = null;
  319. foreach ($page as $p) {
  320. if ($p->getName() == 'image') {
  321. $html = $html . '<img style="position: absolute; top: ' . $p['top'] . 'px; left: ' . $p['left'] . 'px;" width="' . $p['width'] . '" height="' . $p['height'] . '" src="' . $p['src'] . '">';
  322. $hasImages = true;
  323. }
  324. if ($p->getName() == 'text') {
  325. $id = (int) $p['font'];
  326. $font_size = $fonts[$id]['size'];
  327. $font_color = $fonts[$id]['color'];
  328. $font_family = $fonts[$id]['family'];
  329. $style = '';
  330. $style = $style . 'position: absolute;';
  331. $style = $style . "color: $font_color;";
  332. $style = $style . "font-family: $font_family;";
  333. $style = $style . "font-weight: 900;";
  334. $style = $style . "width: " . $p['width'] . "px;";
  335. $style = $style . "height: " . $p['height'] . "px;";
  336. $style = $style . "top: " . $p['top'] . "px;";
  337. $style = $style . "left: " . $p['left'] . "px;";
  338. // $style = $style . "font-size: $font_size" . "px;";
  339. if ($p->i) {
  340. $content = '<i>' . $p->i . '</i>';
  341. } else if ($p->b) {
  342. $content = '<b>' . $p->b . '</b>';
  343. } else {
  344. $content = $p;
  345. }
  346. // @TODO Must chain paragraphs if top are almost same.
  347. $tag = $this->getTag($p, $previousP, $font_size);
  348. $html = $html . '<' . $tag . ' style="' . $style . '">' . $content . '</' . $tag . '>';
  349. $hasText = true;
  350. }
  351. $previousP = $p;
  352. }
  353. $htmls[] = '<html><head><title></title></head><body>' . $html . '</body></html>';
  354. }
  355. } catch (\Exception $exception) {
  356. \Illuminate\Support\Facades\Log::info($exception->getTraceAsString());
  357. }
  358. return [
  359. 'has_images' => $hasImages,
  360. 'has_text' => $hasText,
  361. 'htmls' => $htmls,
  362. ];
  363. }
  364. protected function getTag($p, $previousP, $size)
  365. {
  366. if ($size > 24) {
  367. return 'h1';
  368. }
  369. if ($size > 18) {
  370. return 'h2';
  371. }
  372. if ($size > 16) {
  373. return 'h3';
  374. }
  375. if ($previousP && $p['top'] - $previousP['top'] <= 5) {
  376. return 'span';
  377. }
  378. return 'p';
  379. }
  380. }