diff --git a/app/Console/Commands/AnalyzePerformance.php b/app/Console/Commands/AnalyzePerformance.php new file mode 100644 index 0000000..d6141a2 --- /dev/null +++ b/app/Console/Commands/AnalyzePerformance.php @@ -0,0 +1,85 @@ +argument('path'); + + if ( ! is_dir($directoryPath)) { + $this->error('The path is invalid: not a directory.'); + + return; + } + + $redis = Redis::connection(); + + $redis->set('analyze_performance_time', Carbon::now()->format('U')); + $redis->set('analyze_performance_path', $directoryPath); + + $allFiles = $this->getDirContents($directoryPath); + + $redis->set('analyze_performance_remaining_files', count($allFiles)); + + foreach ($allFiles as $index => $file) { + $handler = new DocumentHandler($index, new UploadedFile($file, "File {$index}"), false); + + $handler->handle(); + } + + $this->info('Processing... When it\'s done the results will be added to the \'ingest_analyze_performance.txt\' file in the directory you have provided.'); + } + + protected function getDirContents($dir, &$results = array()) + { + $files = scandir($dir); + + foreach ($files as $key => $value) { + $path = realpath($dir . DIRECTORY_SEPARATOR . $value); + + if (!is_dir($path)) { + $results[] = $path; + } else if ($value != "." && $value != "..") { + $this->getDirContents($path, $results); + } + } + + return $results; + } +} diff --git a/app/Ingest/Convertor.php b/app/Ingest/Convertor.php index 8a204af..522cbee 100644 --- a/app/Ingest/Convertor.php +++ b/app/Ingest/Convertor.php @@ -45,22 +45,16 @@ class Convertor private function convertToHtml() { - (new Process(['export HOME=' . env('USER_HOME_PATH')]))->run(); + $office = new Office(); - $process = new Process([ - 'soffice', - '--headless', - '--convert-to', + $success = $office->run( 'html:HTML:EmbedImages', $this->storage->path($this->path), - '--outdir', $this->storage->path('contracts') - ]); - - $process->run(); + ); - if (!$process->isSuccessful()) { - throw new ProcessFailedException($process); + if (! $success) { + throw new \Exception('Something went wrong while tried converting to HTML for file: ' . $this->path); } $this->storage->delete($this->path); diff --git a/app/Ingest/DocumentHandler.php b/app/Ingest/DocumentHandler.php index 38e4dfe..6d69f12 100644 --- a/app/Ingest/DocumentHandler.php +++ b/app/Ingest/DocumentHandler.php @@ -9,6 +9,7 @@ class DocumentHandler { protected $id; protected $document; + protected $fromRequest; const DOCX_MIME_TYPE = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'; const DOC_MIME_TYPE = 'application/msword'; @@ -34,18 +35,18 @@ class DocumentHandler self::PLAIN_TEXT_TYPE => 'txt', ]; - public function __construct($id, $document) + public function __construct($id, $document, $fromRequest = true) { $this->id = $id; $this->document = $document; + $this->fromRequest = $fromRequest; } public function handle() { $storage = Storage::disk('local'); - $file = request()->file('document'); - $mimeType = $file->getClientMimeType(); + $mimeType = $this->document->getClientMimeType(); if (!array_key_exists($mimeType, $this->supportedFiles)) { throw new \Exception('File not supported.'); @@ -55,8 +56,8 @@ class DocumentHandler $id = str_replace(' ', '_', $this->id); - $path = $storage->putFileAs("contracts/$id", $file, "document.$type"); + $path = $storage->putFileAs("contracts/$id", $this->document, "document.$type"); - IngestDocuments::dispatch($this->id, $path, $type); + IngestDocuments::dispatch($this->id, $path, $type, $this->fromRequest); } } diff --git a/app/Ingest/DocxConvertor.php b/app/Ingest/DocxConvertor.php index 9940453..35c862f 100644 --- a/app/Ingest/DocxConvertor.php +++ b/app/Ingest/DocxConvertor.php @@ -9,33 +9,33 @@ class DocxConvertor extends AbstractConvertor { public function execute() { -// $this->convertToText(); -// -// $convertor = new TextConvertor($this->storage, "$this->directoryPath/document.txt"); -// -// $convertor->execute(); + $this->convertToPdfWithLibreOffice(); - $this->convertToPDF(); + $pdfFilePath = "$this->directoryPath/document.pdf"; - $convertor = new PDFConvertor($this->storage, "$this->directoryPath/document.pdf"); + if ( ! $this->storage->exists($pdfFilePath)) { + throw new \Exception('Failed to convert to PDF: ' . $pdfFilePath); + } + + $convertor = new PDFConvertor($this->storage, $pdfFilePath); $convertor->execute(); } - protected function convertToText() + protected function convertToPDF() { (new Process(['export HOME=' . env('USER_HOME_PATH')]))->run(); $process = new Process([ - 'soffice', - '--headless', - '--convert-to', - 'txt', + 'unoconv', + '-f', + 'pdf', +// '-c=socket,host=localhost,port=' . (2000 + rand(2, 7)) . ';urp;StarOffice.ComponentContext', $this->storage->path($this->path), - '--outdir', - $this->storage->path($this->directoryPath) ]); + $process->setTimeout(10); + $process->run(); if (!$process->isSuccessful()) { @@ -45,21 +45,18 @@ class DocxConvertor extends AbstractConvertor $this->deleteOriginalDocument(); } - protected function convertToPDF() + protected function convertToPdfWithLibreOffice() { - (new Process(['export HOME=' . env('USER_HOME_PATH')]))->run(); + $office = new Office(); - $process = new Process([ - 'unoconv', - '-f', + $success = $office->run( 'pdf', $this->storage->path($this->path), - ]); - - $process->run(); + $this->storage->path($this->directoryPath) + ); - if (!$process->isSuccessful()) { - throw new ProcessFailedException($process); + if (! $success) { + throw new \Exception('Failed when converting from DOCX to PDF for file: ' . $this->path); } $this->deleteOriginalDocument(); diff --git a/app/Ingest/OCR.php b/app/Ingest/OCR.php index ecb8ad9..5c9483b 100644 --- a/app/Ingest/OCR.php +++ b/app/Ingest/OCR.php @@ -52,13 +52,10 @@ class OCR $directory = pathinfo($this->path, PATHINFO_DIRNAME); $newPath = "$directory/$filePath"; - $moved = File::move(base_path($filePath), $newPath); - - if ( ! $moved) { - throw new \Exception('Something went wrong while moving file.'); + // The file may not be created by the library for various reasons, including if it does not have text. + if (File::exists($newPath)) { + $this->path = $newPath; } - - $this->path = $newPath; } protected function applyDeskew() diff --git a/app/Ingest/Office.php b/app/Ingest/Office.php new file mode 100644 index 0000000..4581dfd --- /dev/null +++ b/app/Ingest/Office.php @@ -0,0 +1,62 @@ +id = uniqid(); + $this->directory = 'soffice-dir-' . $this->id; + + (new Process(['export HOME=' . env('USER_HOME_PATH')]))->run(); + } + + public function run($convertTo, $filePath, $directoryPath) + { + $this->makeTemporaryDirectory(); + + $success = $this->runConversion($convertTo, $filePath, $directoryPath); + + // @TODO Does not work at the moment. +// $this->removeTemporaryDirectory(); + + return $success; + } + + protected function runConversion($convertTo, $filePath, $directoryPath) + { + $process = new Process([ + 'soffice', + '--accept="pipe,name=soffice-pipe-' . $this->id . ';urp;StarOffice.ServiceMananger"', + '-env:UserInstallation=file:///tmp/' . $this->directory, + '--headless', + '--convert-to', + $convertTo, + $filePath, + '--outdir', + $directoryPath + ]); + + $process->setTimeout(10); + + $process->run(); + + return $process->isSuccessful(); + } + + protected function makeTemporaryDirectory() + { + (new Process(['mkdir /tmp/' . $this->directory]))->run(); + } + + protected function removeTemporaryDirectory() + { + (new Process(['rm -rf /tmp/' . $this->directory]))->run(); + } +} diff --git a/app/Ingest/OtherConvertor.php b/app/Ingest/OtherConvertor.php index 53f6839..3ab91bb 100644 --- a/app/Ingest/OtherConvertor.php +++ b/app/Ingest/OtherConvertor.php @@ -24,25 +24,19 @@ class OtherConvertor extends AbstractConvertor */ private function convertToDocx() { - (new Process(['export HOME=' . env('USER_HOME_PATH')]))->run(); - /** * Convert doc,dot,rtf,odt to docx */ - $process = new Process([ - 'soffice', - '--headless', - '--convert-to', + $office = new Office(); + + $success = $office->run( 'docx', $this->storage->path($this->path), - '--outdir', $this->storage->path($this->directoryPath) - ]); - - $process->run(); + ); - if (!$process->isSuccessful()) { - throw new ProcessFailedException($process); + if (! $success) { + throw new \Exception('Something went wrong while tried converting to DOCX for file: ' . $this->path); } $this->deleteOriginalDocument(); diff --git a/app/Ingest/PDFConvertor.php b/app/Ingest/PDFConvertor.php index 7abde61..c3a8a5b 100644 --- a/app/Ingest/PDFConvertor.php +++ b/app/Ingest/PDFConvertor.php @@ -12,47 +12,13 @@ class PDFConvertor extends AbstractConvertor { // $this->prepareForConvertPDF(); - $result = $this->getFileContents(); + $contents = $this->getFileContents(); - if ( ! $result['has_images'] && ! $result['has_text']) { + if ( ! $contents) { throw new \Exception('Cannot get pdf file contents.'); } - if ($result['has_text']) { - $mdContents = ''; - - foreach ($result['htmls'] as $html) { - $converter = new HtmlConverter(); - $converter->getConfig()->setOption('strip_tags', true); - - $contents = $converter->convert($html); - - $mdContents = $mdContents . "\n\n" . $contents; - } - - $this->storage->put("$this->directoryPath/document.md", $mdContents); - - return; - } - - // Only contains images. - $imagesContent = ''; - $files = $this->storage->allFiles($this->path); - - foreach ($files as $file) { - // Only get the image files from the directory, it may contain some empty html files too. - - // @TODO Only OCR images with text and delete them afterwards, the remaining ignore and keep. - if (in_array(pathinfo($file, PATHINFO_EXTENSION), ['jpg', 'png'])) { - $ocr = new OCR($this->storage->path($file)); - - $imagesContent = $imagesContent . $ocr->execute(); - - $this->storage->delete($file); - } - } - - $this->storage->put("$this->directoryPath/document.md", $imagesContent); + $this->storage->put("$this->directoryPath/document.md", $contents); } protected function getFileContents() @@ -115,13 +81,14 @@ class PDFConvertor extends AbstractConvertor ksort($orderedList[$pageNumber]); } - $htmls = []; $hasImages = false; $hasText = false; $imagesCount = 0; $imagesInFooter = true; + $mdContents = ''; + try { foreach ($orderedList as $page) { $html = ''; @@ -132,19 +99,38 @@ class PDFConvertor extends AbstractConvertor foreach ($items as $p) { if ($p->getName() == 'image') { - $hasImages = true; + $basePath = $this->storage->path(''); + $imageFilePath = str_replace($basePath, '', $p['src']); - $imagesCount += 1; - $caption = "Fig. $imagesCount"; + $textContents = $this->applyOCR($imageFilePath); - $imageHTML = $this->handleImage($p, $caption); + if ($textContents) { + if ($html) { + $mdContents = $mdContents . $this->convertHtmlToMD($html) . "\n"; - if ( ! $imagesInFooter) { - $html = $html . $imageHTML; + $html = ''; + } + + $mdContents = $mdContents . $textContents . "\n"; + + $this->storage->delete($imageFilePath); + + $hasText = true; } else { - $html = $html . "

$caption

"; + $hasImages = true; + + $imagesCount += 1; + $caption = "Fig. $imagesCount"; - $footerImages[] = $imageHTML; + $imageHTML = $this->handleImage($p, $caption); + + if ( ! $imagesInFooter) { + $html = $html . $imageHTML; + } else { + $html = $html . "

$caption

"; + + $footerImages[] = $imageHTML; + } } } @@ -161,11 +147,10 @@ class PDFConvertor extends AbstractConvertor if ($imagesInFooter) { foreach ($footerImages as $index => $footerImage) { $html = $html . '

' . $footerImage . '

'; -// $html = $html . '

Fig. ' . ($index + 1) . '

'; } } - $htmls[] = '' . $html . ''; + $mdContents = $mdContents . $this->convertHtmlToMD($html) . "\n\n"; } } catch (\Exception $exception) { $this->storage->deleteDirectory($this->directoryPath); @@ -183,11 +168,7 @@ class PDFConvertor extends AbstractConvertor $this->storage->delete($xmlFilePath); } - return [ - 'has_images' => $hasImages, - 'has_text' => $hasText, - 'htmls' => $htmls, - ]; + return $mdContents; } protected function handleImage($p, $caption) @@ -254,6 +235,23 @@ class PDFConvertor extends AbstractConvertor return 'span'; } + protected function applyOCR($path) + { + $ocr = new OCR($this->storage->path($path)); + + return $ocr->execute(); + } + + protected function convertHtmlToMD($contents) + { + $html = '' . $contents . ''; + + $converter = new HtmlConverter(); + $converter->getConfig()->setOption('strip_tags', true); + + return $converter->convert($html); + } + protected function prepareForConvertPDF() { (new Process(['export HOME=' . env('USER_HOME_PATH')]))->run(); diff --git a/app/Jobs/IngestDocuments.php b/app/Jobs/IngestDocuments.php index 1c08f87..cb654f2 100644 --- a/app/Jobs/IngestDocuments.php +++ b/app/Jobs/IngestDocuments.php @@ -11,7 +11,9 @@ use Illuminate\Bus\Queueable; use Illuminate\Contracts\Queue\ShouldQueue; use Illuminate\Foundation\Bus\Dispatchable; use Illuminate\Queue\InteractsWithQueue; +use Illuminate\Support\Carbon; use Illuminate\Support\Facades\Log; +use Illuminate\Support\Facades\Redis; use Illuminate\Support\Facades\Storage; class IngestDocuments implements ShouldQueue @@ -21,6 +23,7 @@ class IngestDocuments implements ShouldQueue protected $id; private $path; protected $type; + protected $fromRequest; /** * @var \Illuminate\Contracts\Filesystem\Filesystem @@ -53,12 +56,14 @@ class IngestDocuments implements ShouldQueue * @param $id * @param string $path * @param $type + * @param $fromRequest */ - public function __construct($id, string $path, $type) + public function __construct($id, string $path, $type, $fromRequest) { $this->id = $id; $this->path = $path; $this->type = $type; + $this->fromRequest = $fromRequest; $this->storage = Storage::disk('local'); $this->parserDocx = new ParseDocx(); @@ -86,7 +91,17 @@ class IngestDocuments implements ShouldQueue return; } - SendToCore::dispatch($this->id, pathinfo($this->path, PATHINFO_DIRNAME)); + $directoryPath = pathinfo($this->path, PATHINFO_DIRNAME); + + if ($this->fromRequest) { + SendToCore::dispatch($this->id, $directoryPath); + + return; + } + + $this->storage->deleteDirectory($directoryPath); + + $this->updateAnalyzer(); } public function failed() @@ -95,13 +110,48 @@ class IngestDocuments implements ShouldQueue $this->storage = Storage::disk('local'); } - Log::error('Ingest documents failed.'); + Log::error('Ingest documents failed. ' . $this->path); + + $directoryPath = pathinfo($this->path, PATHINFO_DIRNAME); -// // @TODO Delete docx, txt and md files. -// if ($this->storage->exists($this->path)) { -// $this->storage->delete($this->path); -// } + if ($this->fromRequest) { + SendToCore::dispatch($this->id, $directoryPath, true); + + return; + } + + $this->storage->deleteDirectory($directoryPath); + + $this->updateAnalyzer(true); + } + + protected function updateAnalyzer($failed = false) + { + $redis = Redis::connection(); + + if ($failed) { + $redis->set('analyze_performance_error', '1'); + } + + $remainingFiles = $redis->get('analyze_performance_remaining_files'); + $remainingFiles -= 1; + + if ($remainingFiles === 0) { + $startedAt = $redis->get('analyze_performance_time'); + $endedAt = Carbon::now()->format('U'); + $directoryPath = $redis->get('analyze_performance_path'); + + $data = 'Time elapsed in seconds: ' . ($endedAt - $startedAt) . "\n"; + + if ($failed) { + $data = $data . 'Something went wrong while processing the files.'; + } + + file_put_contents($directoryPath . '/ingest_analyze_performance.txt', $data); + + return; + } - SendToCore::dispatch($this->id, pathinfo($this->path, PATHINFO_DIRNAME), true); + $redis->set('analyze_performance_remaining_files', $remainingFiles); } } diff --git a/resources/python/dewarp/page_dewarp.py b/resources/python/dewarp/page_dewarp.py index 222df32..c0b1e7e 100755 --- a/resources/python/dewarp/page_dewarp.py +++ b/resources/python/dewarp/page_dewarp.py @@ -785,7 +785,7 @@ def get_page_dims(corners, rough_dims, params): return dims -def remap_image(name, img, small, page_dims, params): +def remap_image(name, dirname, img, small, page_dims, params): height = 0.5 * page_dims[1] * OUTPUT_ZOOM * img.shape[0] height = round_nearest_multiple(height, REMAP_DECIMATE) @@ -833,7 +833,7 @@ def remap_image(name, img, small, page_dims, params): pil_image = pil_image.convert('1') threshfile = name + '_thresh.png' - pil_image.save(threshfile, dpi=(OUTPUT_DPI, OUTPUT_DPI)) + pil_image.save(dirname + '/' + threshfile, dpi=(OUTPUT_DPI, OUTPUT_DPI)) if DEBUG_LEVEL >= 1: height = small.shape[0] @@ -861,6 +861,7 @@ def main(): img = cv2.imread(imgfile) small = resize_to_screen(img) basename = os.path.basename(imgfile) + dirname = os.path.dirname(imgfile) name, _ = os.path.splitext(basename) print('loaded', basename, 'with size', imgsize(img), end=' ') @@ -907,7 +908,7 @@ def main(): page_dims = get_page_dims(corners, rough_dims, params) - outfile = remap_image(name, img, small, page_dims, params) + outfile = remap_image(name, dirname, img, small, page_dims, params) outfiles.append(outfile) diff --git a/resources/python/ocr/localize_text_tesseract.py b/resources/python/ocr/localize_text_tesseract.py new file mode 100644 index 0000000..63a162b --- /dev/null +++ b/resources/python/ocr/localize_text_tesseract.py @@ -0,0 +1,29 @@ +import cv2 + +image = cv2.imread("logo.jpg", 1) + +img = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + +cv2.threshold(img, 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU, img) +cv2.bitwise_not(img, img) + +rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (30, 5)) + +img = cv2.morphologyEx(img, cv2.MORPH_CLOSE, rect_kernel) +contours, hier = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) + +if len(contours) != 0: + ROI_number = 0 + for c in contours: + x,y,w,h = cv2.boundingRect(c) + + # Depends on text size, so the greater the value the less objects we get. + if (h > 50): + cv2.rectangle(image, (x,y), (x+w,y+h), (0,0,255), 1) + + ROI = image[y:y+h, x:x+w] + cv2.imwrite('results/ROI_{}.png'.format(ROI_number), ROI) + ROI_number += 1 + +cv2.imshow("Result", image) +cv2.waitKey(0) diff --git a/resources/python/ocr/logo.jpg b/resources/python/ocr/logo.jpg new file mode 100644 index 0000000..90b41dd Binary files /dev/null and b/resources/python/ocr/logo.jpg differ diff --git a/resources/python/ocr/results/.gitignore b/resources/python/ocr/results/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/resources/python/ocr/results/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore