prepareForConvertPDF(); $result = $this->getFileContents(); if ( ! $result['has_images'] && ! $result['has_text']) { throw new \Exception('Cannot get pdf file contents.'); } if ($result['has_text']) { $mdContents = ''; foreach ($result['htmls'] as $html) { $converter = new HtmlConverter(); $converter->getConfig()->setOption('strip_tags', true); $contents = $converter->convert($html); $mdContents = $mdContents . "\n\n" . $contents; } $this->storage->put("$this->directoryPath/document.md", $mdContents); return; } // Only contains images. $imagesContent = ''; $files = $this->storage->allFiles($this->path); foreach ($files as $file) { // Only get the image files from the directory, it may contain some empty html files too. // @TODO Only OCR images with text and delete them afterwards, the remaining ignore and keep. if (in_array(pathinfo($file, PATHINFO_EXTENSION), ['jpg', 'png'])) { $ocr = new OCR($this->storage->path($file)); $imagesContent = $imagesContent . $ocr->execute(); $this->storage->delete($file); } } $this->storage->put("$this->directoryPath/document.md", $imagesContent); }

    /**
     * Convert the stored PDF to XML with `pdftohtml` and return the parsed data.
     *
     * Runs `pdftohtml -xml <pdf> <directoryPath>/html`. When the command
     * succeeds, the source PDF is deleted from storage and the result of
     * getDataFromXML() (which reads `<directoryPath>/html.xml`) is returned.
     *
     * @return mixed Whatever getDataFromXML() returns.
     *
     * @throws ProcessFailedException When `pdftohtml` does not finish successfully.
     */
    protected function getFileContents()
    {
        $xmlTarget = $this->storage->path("$this->directoryPath/html");
        $sourcePdf = $this->storage->path($this->path);

        $pdfToHtml = new Process(['pdftohtml', '-xml', $sourcePdf, $xmlTarget]);
        $pdfToHtml->run();

        if ($pdfToHtml->isSuccessful()) {
            // Remove original document.
            $this->storage->delete($this->path);

            return $this->getDataFromXML();
        }

        // The source PDF is left in place when the conversion fails.
        throw new ProcessFailedException($pdfToHtml);
    }

protected function getDataFromXML() { $xmlFilePath = "$this->directoryPath/html.xml"; $contents = $this->storage->get($xmlFilePath); $xml = simplexml_load_string($contents); $orderedList = []; $fonts = []; foreach ($xml->page as $page) { $pageNumber = (int) $page['number'][0]; $orderedList[$pageNumber] = []; foreach ($page as $p) { if ($p->getName() === 'fontspec') { $fonts[(int) $p['id']]['family'] = (string) $p['family']; $fonts[(int) $p['id']]['size'] = (string) $p['size']; $fonts[(int) $p['id']]['color'] = (string) $p['color']; } if (isset($p['top'])) { $top = (int) $p['top']; if ( ! array_key_exists($top, $orderedList[$pageNumber])) { $orderedList[$pageNumber][$top] = []; } $orderedList[$pageNumber][$top][] = $p; } } ksort($orderedList[$pageNumber]); } $htmls = []; $hasImages = false; $hasText = false; $imagesCount = 0; $imagesInFooter = true; try { foreach ($orderedList as $page) { $html = ''; $footerImages = []; foreach ($page as $items) { $continuousP = ''; foreach ($items as $p) { if ($p->getName() == 'image') { $hasImages = true; $imagesCount += 1; $caption = "Fig. $imagesCount"; $imageHTML = $this->handleImage($p, $caption); if ( ! $imagesInFooter) { $html = $html . $imageHTML; } else { $html = $html . "
$caption
"; $footerImages[] = $imageHTML; } } if ($p->getName() == 'text') { $continuousP = $continuousP . $this->handleText($p, $fonts); $hasText = true; } } $html = $html . '' . $continuousP . '
'; } if ($imagesInFooter) { foreach ($footerImages as $index => $footerImage) { $html = $html . '' . $footerImage . '
'; // $html = $html . 'Fig. ' . ($index + 1) . '
'; } } $htmls[] = '