prepareForConvertPDF();

        $contents = $this->getFileContents();

        if ( ! $contents) {
            throw new \Exception('Cannot get pdf file contents.');
        }

        $this->storage->put("$this->directoryPath/document.html", $contents);
    }

    /**
     * Convert the stored PDF to XML with `pdftohtml -xml` and build HTML from it.
     *
     * The original document at `$this->path` is deleted once the conversion
     * process has succeeded.
     *
     * @return string HTML produced by getDataFromXML().
     *
     * @throws ProcessFailedException When the `pdftohtml` process fails.
     */
    protected function getFileContents()
    {
        $outputPath = $this->storage->path("$this->directoryPath/html");

        $process = new Process([
            'pdftohtml',
            '-xml',
            $this->storage->path($this->path),
            $outputPath,
        ]);
        $process->run();

        if (!$process->isSuccessful()) {
            throw new ProcessFailedException($process);
        }

        // Remove original document.
        $this->storage->delete($this->path);

        return $this->getDataFromXML();
    }

    /**
     * Read `html.xml` from the working directory and turn it into HTML.
     *
     * Elements of every page are grouped by their integer `top` attribute and
     * processed in ascending `top` order. `image` elements are passed through
     * OCR: when text is recognised the text is emitted and the image file is
     * deleted, otherwise the image is rendered through handleImage(). `text`
     * elements are rendered through handleText().
     *
     * When neither text nor images were found the whole working directory is
     * removed; otherwise only the XML file is removed.
     *
     * @return string The generated HTML (may be empty).
     *
     * @throws \Exception When the XML cannot be parsed, or when rendering fails
     *                    (the working directory is removed before rethrowing).
     */
    protected function getDataFromXML()
    {
        $xmlFilePath = "$this->directoryPath/html.xml";
        $contents = $this->storage->get($xmlFilePath);
        $xml = simplexml_load_string((string) $contents);

        if ($xml === false) {
            // Nothing usable was produced, so do not leave the directory behind.
            $this->storage->deleteDirectory($this->directoryPath);

            throw new \Exception('Cannot parse the XML generated from the pdf file.');
        }

        $orderedList = [];
        $fonts = [];

        foreach ($xml->page as $page) {
            $pageNumber = (int) $page['number'][0];
            $orderedList[$pageNumber] = [];

            foreach ($page as $p) {
                if ($p->getName() === 'fontspec') {
                    $fontId = (int) $p['id'];
                    $family = (string) $p['family'];

                    // Drop everything up to and including the first '+', but only
                    // when a '+' exists; otherwise keep the family name untouched.
                    $plusPosition = strpos($family, '+');
                    $fonts[$fontId]['family'] = $plusPosition === false
                        ? $family
                        : substr($family, $plusPosition + 1);
                    $fonts[$fontId]['size'] = (string) $p['size'];
                    $fonts[$fontId]['color'] = (string) $p['color'];
                }

                if (isset($p['top'])) {
                    $top = (int) $p['top'];
                    $orderedList[$pageNumber][$top][] = $p;
                }
            }

            ksort($orderedList[$pageNumber]);
        }

        $hasImages = false;
        $hasText = false;
        $imagesCount = 0;
        $htmlContents = '';

        try {
            foreach ($orderedList as $page) {
                $html = '';
                $footerImages = [];

                foreach ($page as $items) {
                    $continuousP = '';
                    $firstOfText = true;

                    foreach ($items as $key => $p) {
                        if ($p->getName() === 'image') {
                            $imageInFooter = false;
                            $basePath = $this->storage->path('');
                            $imageFilePath = str_replace($basePath, '', (string) $p['src']);

                            try {
                                $textContents = $this->applyOCR($imageFilePath);
                            } catch (TesseractOcrException $e) {
                                // Could not get text content from image.
                                // This means the image doesn't have text.
                                $textContents = '';
                            }

                            if ($textContents) {
                                $imageInFooter = true;

                                // Flush what was collected so far so the OCR text
                                // lands after it in the final output.
                                if ($html) {
                                    $htmlContents = $htmlContents . $html;
                                    $html = '';
                                }

                                $htmlContents = $htmlContents . "\n{$textContents}\n";
                                $this->storage->delete($imageFilePath);
                                $hasText = true;
                            } else {
                                $hasImages = true;
                                $imagesCount += 1;
                                $caption = "Fig. $imagesCount";
                                $imageHTML = $this->handleImage($p, $caption);

                                // NOTE(review): $imageInFooter is always false in this
                                // branch, so the footer path below never runs and
                                // $footerImages stays empty — confirm the intent.
                                if (!$imageInFooter) {
                                    $html = $html . $imageHTML;
                                } else {
                                    $html = $html . "\n\n{$caption}\n\n";
                                    $footerImages[] = $imageHTML;
                                }
                            }
                        }

                        if ($p->getName() === 'text') {
                            // A lone bullet is not rendered on its own; it is
                            // prepended to the text element that follows it.
                            if ((string) $p === '·') {
                                continue;
                            }

                            $addition = null;
                            if (isset($items[$key - 1]) && (string) $items[$key - 1] === '·') {
                                $addition = '· ';
                            }

                            $continuousP = $continuousP . $this->handleText($p, $fonts, $addition, $firstOfText);
                            $firstOfText = false;
                            $hasText = true;
                        }
                    }

                    $html = $html . "\n\n" . $continuousP . "\n\n";
                }

                foreach ($footerImages as $footerImage) {
                    $html = $html . "\n\n" . $footerImage . "\n\n";
                }

                $htmlContents = $htmlContents . $html;
            }
        } catch (\Exception $exception) {
            $this->storage->deleteDirectory($this->directoryPath);
            Log::info($exception->getTraceAsString());

            throw $exception;
        }

        if ( ! $hasText && ! $hasImages) {
            // Remove directory because we do not have any use for it anymore.
            $this->storage->deleteDirectory($this->directoryPath);
        } else {
            // Remove the unnecessary 'xml' file.
            $this->storage->delete($xmlFilePath);
        }

        return $htmlContents;
    }

    /**
     * Build the output fragment for an image element.
     *
     * @param \SimpleXMLElement $p       Image element; its `src` attribute is read.
     * @param string            $caption Caption text for the image.
     *
     * @return string
     */
    protected function handleImage($p, $caption)
    {
        $html = '';

        // NOTE(review): $src is computed but not referenced by any of the strings
        // below — the markup that should use it appears to be missing; confirm.
        $src = './contracts-images/'
            . pathinfo($this->directoryPath, PATHINFO_BASENAME)
            . '/'
            . pathinfo((string) $p['src'], PATHINFO_BASENAME);

        $html = $html . "\n";
        $html = $html . '' . $caption . '';
        $html = $html . "\n";
        $html = $html . "\n";

        return $html;
    }

    /**
     * Render a text element as a styled HTML element.
     *
     * @param \SimpleXMLElement $p           Text element (`font`, `height`, `left` attributes are read).
     * @param array             $fonts       Font specs keyed by font id, each with `size`, `color`, `family`.
     * @param string|null       $addition    Optional prefix prepended to the content.
     * @param bool              $firstOfText When true, a left padding derived from `left` is added.
     *
     * @return string
     */
    protected function handleText($p, $fonts, $addition = null, $firstOfText = false)
    {
        $id = (int) $p['font'];

        // Fall back to empty values when the font id has no matching fontspec.
        $font = $fonts[$id] ?? [];
        $font_size = $font['size'] ?? '';
        $font_color = $font['color'] ?? '';
        $font_family = $font['family'] ?? '';

        $style = '';
        $style = $style . 'position: relative;';
        $style = $style . "color: $font_color;";
        $style = $style . "font-family: $font_family;";
        $style = $style . "height: " . $p['height'] . "px;";

        if ($firstOfText) {
            $style = $style . "padding-left: " . (intval($p['left']) - 90) . "px;";
        }

        $style = $style . "font-size: $font_size" . "px;";

        // NOTE(review): the italic/bold/anchor branches concatenate empty strings
        // around the child text, so no wrapping markup is emitted — confirm
        // whether tags were intended here.
        if ($p->i) {
            $content = '' . $p->i . '';
        } elseif ($p->b) {
            $content = '' . $p->b . '';
        } elseif ($p->a) {
            $content = $p . '' . $p->a . '';
        } else {
            $content = $p;
        }

        if ($addition) {
            $content = $addition . $content;
        }

        $tag = $this->getTag($font_size);

        // Close the element that is opened here so the output stays well-formed.
        return '<' . $tag . ' style="' . $style . '">' . $content . '</' . $tag . '>';
    }

    /**
     * Pick the HTML tag name for a given font size.
     *
     * @param mixed $size Font size; compared numerically against fixed thresholds.
     *
     * @return string 'h1' above 26, 'h2' above 20, 'h3' above 18, otherwise 'span'.
     */
    protected function getTag($size)
    {
        // @TODO Needed to bump values up by 2, the XML loader gives different results on different servers.
        if ($size > 26) {
            return 'h1';
        }

        if ($size > 20) {
            return 'h2';
        }

        if ($size > 18) {
            return 'h3';
        }

        return 'span';
    }

    /**
     * Run OCR on a file located inside the storage disk.
     *
     * @param string $path Path relative to the storage root.
     *
     * @return mixed Whatever the OCR wrapper's execute() returns.
     */
    protected function applyOCR($path)
    {
        $ocr = new OCR($this->storage->path($path));

        return $ocr->execute();
    }

    /**
     * Convert HTML to strict Markdown using `pandoc`.
     *
     * The HTML is written to a temporary `document.html` inside the working
     * directory, which is removed again whether or not pandoc succeeds.
     *
     * @param string $contents HTML to convert.
     *
     * @return string pandoc's standard output.
     *
     * @throws ProcessFailedException When the `pandoc` process fails.
     */
    protected function convertHtmlToMD($contents)
    {
        $html = '' . $contents . '';
        $filepath = $this->storage->path($this->directoryPath);
        $documentPath = $filepath . '/document.html';

        file_put_contents($documentPath, $html);

        $process = new Process([
            'pandoc',
            '-f',
            'html',
            $documentPath,
            '-t',
            'markdown_strict',
        ]);

        try {
            $process->run();
        } finally {
            // Always remove the temporary file, even when pandoc could not run.
            unlink($documentPath);
        }

        if (!$process->isSuccessful()) {
            throw new ProcessFailedException($process);
        }

        return $process->getOutput();
    }

    /**
     * Install the `pdftotext` Python package with `pip3`.
     *
     * When USER_HOME_PATH is configured it is passed to the pip process as
     * HOME. A separate `export HOME=...` process cannot influence a later
     * process, so the variable is set on the pip process itself.
     *
     * @return void
     *
     * @throws ProcessFailedException When `pip3 install` fails.
     */
    protected function prepareForConvertPDF()
    {
        $homePath = env('USER_HOME_PATH');
        $environment = $homePath ? ['HOME' => $homePath] : null;

        $process = new Process(
            [
                'pip3',
                'install',
                'pdftotext',
            ],
            null,
            $environment
        );
        $process->run();

        if (!$process->isSuccessful()) {
            throw new ProcessFailedException($process);
        }
    }
}