Browse Source

Performance analyzer. Apply OCR on images in digital documents

hidden_tags_with_bookmarks
Orzu Ionut 3 years ago
parent
commit
70debcfc58
  1. 85
      app/Console/Commands/AnalyzePerformance.php
  2. 16
      app/Ingest/Convertor.php
  3. 11
      app/Ingest/DocumentHandler.php
  4. 45
      app/Ingest/DocxConvertor.php
  5. 9
      app/Ingest/OCR.php
  6. 62
      app/Ingest/Office.php
  7. 18
      app/Ingest/OtherConvertor.php
  8. 104
      app/Ingest/PDFConvertor.php
  9. 66
      app/Jobs/IngestDocuments.php
  10. 7
      resources/python/dewarp/page_dewarp.py
  11. 29
      resources/python/ocr/localize_text_tesseract.py
  12. BIN
      resources/python/ocr/logo.jpg
  13. 2
      resources/python/ocr/results/.gitignore

85
app/Console/Commands/AnalyzePerformance.php

@ -0,0 +1,85 @@
<?php
namespace App\Console\Commands;
use App\Ingest\DocumentHandler;
use Illuminate\Console\Command;
use Illuminate\Http\UploadedFile;
use Illuminate\Support\Carbon;
use Illuminate\Support\Facades\Redis;
/**
 * Console command that pushes every file found under a directory through the
 * ingest pipeline so the total processing time can be measured.
 *
 * The command only records bookkeeping values in Redis (start time, target
 * directory, number of files still to be processed) and hands each file to
 * DocumentHandler; the elapsed-time report itself is produced elsewhere once
 * the remaining-files counter has been consumed.
 */
class AnalyzePerformance extends Command
{
    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'analyze:run {path : The directory path}';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = 'Run analyzer on multiple files in a directory.';

    /**
     * Create a new command instance.
     *
     * @return void
     */
    public function __construct()
    {
        parent::__construct();
    }

    /**
     * Execute the console command.
     *
     * @return int Process exit code: 0 on success, 1 when the path argument is not a directory.
     */
    public function handle()
    {
        $directoryPath = $this->argument('path');

        if (! is_dir($directoryPath)) {
            $this->error('The path is invalid: not a directory.');

            return 1;
        }

        $redis = Redis::connection();

        $redis->set('analyze_performance_time', Carbon::now()->format('U'));
        $redis->set('analyze_performance_path', $directoryPath);
        // Start from a clean slate so a failure recorded by an earlier run is not carried over.
        $redis->del('analyze_performance_error');

        $allFiles = $this->getDirContents($directoryPath);

        if (count($allFiles) === 0) {
            // With nothing to process the remaining-files counter would never be
            // consumed, so promising a results file would be misleading.
            $this->warn('No files were found in the given directory; nothing to analyze.');

            return 0;
        }

        $redis->set('analyze_performance_remaining_files', count($allFiles));

        // NOTE(review): handle() throws for unsupported files, which aborts the loop and
        // leaves the counter above the number of files actually handed over — confirm
        // whether unsupported files should be skipped instead.
        foreach ($allFiles as $index => $file) {
            // UploadedFile::getClientMimeType() only returns what the constructor was given
            // (a generic default otherwise), so the real type has to be detected here for
            // the handler's supported-types lookup to be meaningful.
            $mimeType = mime_content_type($file) ?: null;

            $handler = new DocumentHandler($index, new UploadedFile($file, "File {$index}", $mimeType), false);

            $handler->handle();
        }

        $this->info('Processing... When it\'s done the results will be added to the \'ingest_analyze_performance.txt\' file in the directory you have provided.');

        return 0;
    }

    /**
     * Recursively collect the absolute paths of all files below a directory.
     *
     * @param string $dir     Directory to scan.
     * @param array  $results Accumulator shared between recursive calls.
     *
     * @return array List of resolved file paths (directories themselves are not included).
     */
    protected function getDirContents($dir, &$results = [])
    {
        $entries = scandir($dir);

        // An unreadable directory contributes nothing instead of breaking the loop below.
        if ($entries === false) {
            return $results;
        }

        foreach ($entries as $entry) {
            if ($entry === '.' || $entry === '..') {
                continue;
            }

            $path = realpath($dir . DIRECTORY_SEPARATOR . $entry);

            // realpath() yields false for entries that cannot be resolved (e.g. broken symlinks).
            if ($path === false) {
                continue;
            }

            if (is_dir($path)) {
                $this->getDirContents($path, $results);
            } else {
                $results[] = $path;
            }
        }

        return $results;
    }
}

16
app/Ingest/Convertor.php

@ -45,22 +45,16 @@ class Convertor
private function convertToHtml() private function convertToHtml()
{ {
(new Process(['export HOME=' . env('USER_HOME_PATH')]))->run();
$office = new Office();
$process = new Process([
'soffice',
'--headless',
'--convert-to',
$success = $office->run(
'html:HTML:EmbedImages', 'html:HTML:EmbedImages',
$this->storage->path($this->path), $this->storage->path($this->path),
'--outdir',
$this->storage->path('contracts') $this->storage->path('contracts')
]);
$process->run();
);
if (!$process->isSuccessful()) {
throw new ProcessFailedException($process);
if (! $success) {
throw new \Exception('Something went wrong while tried converting to HTML for file: ' . $this->path);
} }
$this->storage->delete($this->path); $this->storage->delete($this->path);

11
app/Ingest/DocumentHandler.php

@ -9,6 +9,7 @@ class DocumentHandler
{ {
protected $id; protected $id;
protected $document; protected $document;
protected $fromRequest;
const DOCX_MIME_TYPE = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'; const DOCX_MIME_TYPE = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';
const DOC_MIME_TYPE = 'application/msword'; const DOC_MIME_TYPE = 'application/msword';
@ -34,18 +35,18 @@ class DocumentHandler
self::PLAIN_TEXT_TYPE => 'txt', self::PLAIN_TEXT_TYPE => 'txt',
]; ];
public function __construct($id, $document)
public function __construct($id, $document, $fromRequest = true)
{ {
$this->id = $id; $this->id = $id;
$this->document = $document; $this->document = $document;
$this->fromRequest = $fromRequest;
} }
public function handle() public function handle()
{ {
$storage = Storage::disk('local'); $storage = Storage::disk('local');
$file = request()->file('document');
$mimeType = $file->getClientMimeType();
$mimeType = $this->document->getClientMimeType();
if (!array_key_exists($mimeType, $this->supportedFiles)) { if (!array_key_exists($mimeType, $this->supportedFiles)) {
throw new \Exception('File not supported.'); throw new \Exception('File not supported.');
@ -55,8 +56,8 @@ class DocumentHandler
$id = str_replace(' ', '_', $this->id); $id = str_replace(' ', '_', $this->id);
$path = $storage->putFileAs("contracts/$id", $file, "document.$type");
$path = $storage->putFileAs("contracts/$id", $this->document, "document.$type");
IngestDocuments::dispatch($this->id, $path, $type);
IngestDocuments::dispatch($this->id, $path, $type, $this->fromRequest);
} }
} }

45
app/Ingest/DocxConvertor.php

@ -9,33 +9,33 @@ class DocxConvertor extends AbstractConvertor
{ {
public function execute() public function execute()
{ {
// $this->convertToText();
//
// $convertor = new TextConvertor($this->storage, "$this->directoryPath/document.txt");
//
// $convertor->execute();
$this->convertToPdfWithLibreOffice();
$this->convertToPDF();
$pdfFilePath = "$this->directoryPath/document.pdf";
$convertor = new PDFConvertor($this->storage, "$this->directoryPath/document.pdf");
if ( ! $this->storage->exists($pdfFilePath)) {
throw new \Exception('Failed to convert to PDF: ' . $pdfFilePath);
}
$convertor = new PDFConvertor($this->storage, $pdfFilePath);
$convertor->execute(); $convertor->execute();
} }
protected function convertToText()
protected function convertToPDF()
{ {
(new Process(['export HOME=' . env('USER_HOME_PATH')]))->run(); (new Process(['export HOME=' . env('USER_HOME_PATH')]))->run();
$process = new Process([ $process = new Process([
'soffice',
'--headless',
'--convert-to',
'txt',
'unoconv',
'-f',
'pdf',
// '-c=socket,host=localhost,port=' . (2000 + rand(2, 7)) . ';urp;StarOffice.ComponentContext',
$this->storage->path($this->path), $this->storage->path($this->path),
'--outdir',
$this->storage->path($this->directoryPath)
]); ]);
$process->setTimeout(10);
$process->run(); $process->run();
if (!$process->isSuccessful()) { if (!$process->isSuccessful()) {
@ -45,21 +45,18 @@ class DocxConvertor extends AbstractConvertor
$this->deleteOriginalDocument(); $this->deleteOriginalDocument();
} }
protected function convertToPDF()
protected function convertToPdfWithLibreOffice()
{ {
(new Process(['export HOME=' . env('USER_HOME_PATH')]))->run();
$office = new Office();
$process = new Process([
'unoconv',
'-f',
$success = $office->run(
'pdf', 'pdf',
$this->storage->path($this->path), $this->storage->path($this->path),
]);
$process->run();
$this->storage->path($this->directoryPath)
);
if (!$process->isSuccessful()) {
throw new ProcessFailedException($process);
if (! $success) {
throw new \Exception('Failed when converting from DOCX to PDF for file: ' . $this->path);
} }
$this->deleteOriginalDocument(); $this->deleteOriginalDocument();

9
app/Ingest/OCR.php

@ -52,13 +52,10 @@ class OCR
$directory = pathinfo($this->path, PATHINFO_DIRNAME); $directory = pathinfo($this->path, PATHINFO_DIRNAME);
$newPath = "$directory/$filePath"; $newPath = "$directory/$filePath";
$moved = File::move(base_path($filePath), $newPath);
if ( ! $moved) {
throw new \Exception('Something went wrong while moving file.');
// The file may not be created by the library for various reasons, including if it does not have text.
if (File::exists($newPath)) {
$this->path = $newPath;
} }
$this->path = $newPath;
} }
protected function applyDeskew() protected function applyDeskew()

62
app/Ingest/Office.php

@ -0,0 +1,62 @@
<?php
namespace App\Ingest;
use Symfony\Component\Process\Process;
/**
 * Thin wrapper around the `soffice` command line used for document conversion.
 *
 * Every instance gets its own LibreOffice user-installation directory under
 * /tmp (derived from a unique id) which is passed to soffice through
 * -env:UserInstallation, and is removed again once the conversion has run.
 */
class Office
{
    /** @var string Unique id used to name the pipe and the profile directory. */
    protected $id;

    /** @var string Name (not full path) of the per-instance directory inside /tmp. */
    protected $directory;

    public function __construct()
    {
        $this->id = uniqid();
        $this->directory = 'soffice-dir-' . $this->id;
        // HOME used to be "exported" here through a separate Process; that could never
        // reach the soffice process, so it is now passed as environment in runConversion().
    }

    /**
     * Convert a file with soffice.
     *
     * @param string $convertTo     Value handed to --convert-to (e.g. 'pdf', 'docx').
     * @param string $filePath      Absolute path of the file to convert.
     * @param string $directoryPath Absolute path of the output directory (--outdir).
     *
     * @return bool Whether the soffice process exited successfully.
     */
    public function run($convertTo, $filePath, $directoryPath)
    {
        $this->makeTemporaryDirectory();

        try {
            return $this->runConversion($convertTo, $filePath, $directoryPath);
        } finally {
            // Always clean up, including when the process times out and throws.
            $this->removeTemporaryDirectory();
        }
    }

    /**
     * Build and run the soffice process (10 second timeout).
     *
     * @param string $convertTo
     * @param string $filePath
     * @param string $directoryPath
     *
     * @return bool
     */
    protected function runConversion($convertTo, $filePath, $directoryPath)
    {
        $home = env('USER_HOME_PATH');

        // Only override HOME when it is configured; an empty value would blank the variable.
        // NOTE(review): relies on Symfony Process merging this with the inherited
        // environment (the behaviour from Symfony 4.0 on) — confirm the installed version.
        $environment = $home ? ['HOME' => $home] : null;

        $process = new Process([
            'soffice',
            // NOTE(review): in array form there is no shell, so the double quotes below reach
            // soffice literally, and "ServiceMananger" looks like a typo of "ServiceManager".
            // Left untouched because correcting it may change how soffice behaves — confirm.
            '--accept="pipe,name=soffice-pipe-' . $this->id . ';urp;StarOffice.ServiceMananger"',
            '-env:UserInstallation=file://' . $this->temporaryDirectoryPath(),
            '--headless',
            '--convert-to',
            $convertTo,
            $filePath,
            '--outdir',
            $directoryPath,
        ], null, $environment);

        $process->setTimeout(10);
        $process->run();

        return $process->isSuccessful();
    }

    /**
     * Create the per-instance profile directory.
     *
     * @throws \RuntimeException When the directory cannot be created.
     */
    protected function makeTemporaryDirectory()
    {
        $path = $this->temporaryDirectoryPath();

        // The second is_dir() check covers another process creating the directory concurrently.
        if (! is_dir($path) && ! mkdir($path, 0700, true) && ! is_dir($path)) {
            throw new \RuntimeException('Unable to create temporary directory: ' . $path);
        }
    }

    /**
     * Remove the per-instance profile directory and everything inside it.
     */
    protected function removeTemporaryDirectory()
    {
        // Each argument must be its own array element: Process does not split a command string.
        (new Process(['rm', '-rf', $this->temporaryDirectoryPath()]))->run();
    }

    /**
     * Absolute path of the per-instance profile directory.
     *
     * @return string
     */
    protected function temporaryDirectoryPath()
    {
        return '/tmp/' . $this->directory;
    }
}

18
app/Ingest/OtherConvertor.php

@ -24,25 +24,19 @@ class OtherConvertor extends AbstractConvertor
*/ */
private function convertToDocx() private function convertToDocx()
{ {
(new Process(['export HOME=' . env('USER_HOME_PATH')]))->run();
/** /**
* Convert doc,dot,rtf,odt to docx * Convert doc,dot,rtf,odt to docx
*/ */
$process = new Process([
'soffice',
'--headless',
'--convert-to',
$office = new Office();
$success = $office->run(
'docx', 'docx',
$this->storage->path($this->path), $this->storage->path($this->path),
'--outdir',
$this->storage->path($this->directoryPath) $this->storage->path($this->directoryPath)
]);
$process->run();
);
if (!$process->isSuccessful()) {
throw new ProcessFailedException($process);
if (! $success) {
throw new \Exception('Something went wrong while tried converting to DOCX for file: ' . $this->path);
} }
$this->deleteOriginalDocument(); $this->deleteOriginalDocument();

104
app/Ingest/PDFConvertor.php

@ -12,47 +12,13 @@ class PDFConvertor extends AbstractConvertor
{ {
// $this->prepareForConvertPDF(); // $this->prepareForConvertPDF();
$result = $this->getFileContents();
$contents = $this->getFileContents();
if ( ! $result['has_images'] && ! $result['has_text']) {
if ( ! $contents) {
throw new \Exception('Cannot get pdf file contents.'); throw new \Exception('Cannot get pdf file contents.');
} }
if ($result['has_text']) {
$mdContents = '';
foreach ($result['htmls'] as $html) {
$converter = new HtmlConverter();
$converter->getConfig()->setOption('strip_tags', true);
$contents = $converter->convert($html);
$mdContents = $mdContents . "\n\n" . $contents;
}
$this->storage->put("$this->directoryPath/document.md", $mdContents);
return;
}
// Only contains images.
$imagesContent = '';
$files = $this->storage->allFiles($this->path);
foreach ($files as $file) {
// Only get the image files from the directory, it may contain some empty html files too.
// @TODO Only OCR images with text and delete them afterwards, the remaining ignore and keep.
if (in_array(pathinfo($file, PATHINFO_EXTENSION), ['jpg', 'png'])) {
$ocr = new OCR($this->storage->path($file));
$imagesContent = $imagesContent . $ocr->execute();
$this->storage->delete($file);
}
}
$this->storage->put("$this->directoryPath/document.md", $imagesContent);
$this->storage->put("$this->directoryPath/document.md", $contents);
} }
protected function getFileContents() protected function getFileContents()
@ -115,13 +81,14 @@ class PDFConvertor extends AbstractConvertor
ksort($orderedList[$pageNumber]); ksort($orderedList[$pageNumber]);
} }
$htmls = [];
$hasImages = false; $hasImages = false;
$hasText = false; $hasText = false;
$imagesCount = 0; $imagesCount = 0;
$imagesInFooter = true; $imagesInFooter = true;
$mdContents = '';
try { try {
foreach ($orderedList as $page) { foreach ($orderedList as $page) {
$html = ''; $html = '';
@ -132,19 +99,38 @@ class PDFConvertor extends AbstractConvertor
foreach ($items as $p) { foreach ($items as $p) {
if ($p->getName() == 'image') { if ($p->getName() == 'image') {
$hasImages = true;
$basePath = $this->storage->path('');
$imageFilePath = str_replace($basePath, '', $p['src']);
$imagesCount += 1;
$caption = "Fig. $imagesCount";
$textContents = $this->applyOCR($imageFilePath);
$imageHTML = $this->handleImage($p, $caption);
if ($textContents) {
if ($html) {
$mdContents = $mdContents . $this->convertHtmlToMD($html) . "\n";
if ( ! $imagesInFooter) {
$html = $html . $imageHTML;
$html = '';
}
$mdContents = $mdContents . $textContents . "\n";
$this->storage->delete($imageFilePath);
$hasText = true;
} else { } else {
$html = $html . "<p> $caption </p>";
$hasImages = true;
$imagesCount += 1;
$caption = "Fig. $imagesCount";
$footerImages[] = $imageHTML;
$imageHTML = $this->handleImage($p, $caption);
if ( ! $imagesInFooter) {
$html = $html . $imageHTML;
} else {
$html = $html . "<p> $caption </p>";
$footerImages[] = $imageHTML;
}
} }
} }
@ -161,11 +147,10 @@ class PDFConvertor extends AbstractConvertor
if ($imagesInFooter) { if ($imagesInFooter) {
foreach ($footerImages as $index => $footerImage) { foreach ($footerImages as $index => $footerImage) {
$html = $html . '<p>' . $footerImage . '</p>'; $html = $html . '<p>' . $footerImage . '</p>';
// $html = $html . '<p> Fig. ' . ($index + 1) . '</p>';
} }
} }
$htmls[] = '<html><head><title></title></head><body>' . $html . '</body></html>';
$mdContents = $mdContents . $this->convertHtmlToMD($html) . "\n\n";
} }
} catch (\Exception $exception) { } catch (\Exception $exception) {
$this->storage->deleteDirectory($this->directoryPath); $this->storage->deleteDirectory($this->directoryPath);
@ -183,11 +168,7 @@ class PDFConvertor extends AbstractConvertor
$this->storage->delete($xmlFilePath); $this->storage->delete($xmlFilePath);
} }
return [
'has_images' => $hasImages,
'has_text' => $hasText,
'htmls' => $htmls,
];
return $mdContents;
} }
protected function handleImage($p, $caption) protected function handleImage($p, $caption)
@ -254,6 +235,23 @@ class PDFConvertor extends AbstractConvertor
return 'span'; return 'span';
} }
/**
 * Run OCR on a stored image and return whatever the OCR run produces.
 *
 * @param string $path Image path relative to the storage disk.
 *
 * @return mixed Result of OCR::execute().
 */
protected function applyOCR($path)
{
    $absoluteImagePath = $this->storage->path($path);

    return (new OCR($absoluteImagePath))->execute();
}
/**
 * Wrap an HTML fragment in a minimal document and convert it with HtmlConverter.
 *
 * @param string $contents HTML fragment (body content only).
 *
 * @return string Output of HtmlConverter::convert() with the strip_tags option enabled.
 */
protected function convertHtmlToMD($contents)
{
    $markdownConverter = new HtmlConverter();
    $markdownConverter->getConfig()->setOption('strip_tags', true);

    // The converter is fed a complete document, so the fragment gets a bare html/body shell.
    $document = '<html><head><title></title></head><body>' . $contents . '</body></html>';

    return $markdownConverter->convert($document);
}
protected function prepareForConvertPDF() protected function prepareForConvertPDF()
{ {
(new Process(['export HOME=' . env('USER_HOME_PATH')]))->run(); (new Process(['export HOME=' . env('USER_HOME_PATH')]))->run();

66
app/Jobs/IngestDocuments.php

@ -11,7 +11,9 @@ use Illuminate\Bus\Queueable;
use Illuminate\Contracts\Queue\ShouldQueue; use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Bus\Dispatchable; use Illuminate\Foundation\Bus\Dispatchable;
use Illuminate\Queue\InteractsWithQueue; use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Support\Carbon;
use Illuminate\Support\Facades\Log; use Illuminate\Support\Facades\Log;
use Illuminate\Support\Facades\Redis;
use Illuminate\Support\Facades\Storage; use Illuminate\Support\Facades\Storage;
class IngestDocuments implements ShouldQueue class IngestDocuments implements ShouldQueue
@ -21,6 +23,7 @@ class IngestDocuments implements ShouldQueue
protected $id; protected $id;
private $path; private $path;
protected $type; protected $type;
protected $fromRequest;
/** /**
* @var \Illuminate\Contracts\Filesystem\Filesystem * @var \Illuminate\Contracts\Filesystem\Filesystem
@ -53,12 +56,14 @@ class IngestDocuments implements ShouldQueue
* @param $id * @param $id
* @param string $path * @param string $path
* @param $type * @param $type
* @param $fromRequest
*/ */
public function __construct($id, string $path, $type)
public function __construct($id, string $path, $type, $fromRequest)
{ {
$this->id = $id; $this->id = $id;
$this->path = $path; $this->path = $path;
$this->type = $type; $this->type = $type;
$this->fromRequest = $fromRequest;
$this->storage = Storage::disk('local'); $this->storage = Storage::disk('local');
$this->parserDocx = new ParseDocx(); $this->parserDocx = new ParseDocx();
@ -86,7 +91,17 @@ class IngestDocuments implements ShouldQueue
return; return;
} }
SendToCore::dispatch($this->id, pathinfo($this->path, PATHINFO_DIRNAME));
$directoryPath = pathinfo($this->path, PATHINFO_DIRNAME);
if ($this->fromRequest) {
SendToCore::dispatch($this->id, $directoryPath);
return;
}
$this->storage->deleteDirectory($directoryPath);
$this->updateAnalyzer();
} }
public function failed() public function failed()
@ -95,13 +110,48 @@ class IngestDocuments implements ShouldQueue
$this->storage = Storage::disk('local'); $this->storage = Storage::disk('local');
} }
Log::error('Ingest documents failed.');
Log::error('Ingest documents failed. ' . $this->path);
$directoryPath = pathinfo($this->path, PATHINFO_DIRNAME);
// // @TODO Delete docx, txt and md files.
// if ($this->storage->exists($this->path)) {
// $this->storage->delete($this->path);
// }
if ($this->fromRequest) {
SendToCore::dispatch($this->id, $directoryPath, true);
return;
}
$this->storage->deleteDirectory($directoryPath);
$this->updateAnalyzer(true);
}
/**
 * Record that one analyzed file has finished and, when it was the last one,
 * write the elapsed-time report next to the analyzed files.
 *
 * @param bool $failed Whether the file being accounted for failed to ingest.
 *
 * @return void
 */
protected function updateAnalyzer($failed = false)
{
    $redis = Redis::connection();

    if ($failed) {
        $redis->set('analyze_performance_error', '1');
    }

    // DECR is atomic, so jobs finishing at the same time on different workers cannot
    // both read the same value and lose a decrement (which a separate GET + SET can).
    $remainingFiles = (int) $redis->decr('analyze_performance_remaining_files');

    if ($remainingFiles !== 0) {
        return;
    }

    $startedAt = (int) $redis->get('analyze_performance_time');
    $endedAt = (int) Carbon::now()->format('U');
    $directoryPath = $redis->get('analyze_performance_path');

    // Report a failure if ANY file failed, not only when the last one to finish did.
    $anyFailure = $failed || (string) $redis->get('analyze_performance_error') === '1';

    $data = 'Time elapsed in seconds: ' . ($endedAt - $startedAt) . "\n";

    if ($anyFailure) {
        $data = $data . 'Something went wrong while processing the files.';
    }

    file_put_contents($directoryPath . '/ingest_analyze_performance.txt', $data);

    // Clear the flag so it cannot leak into the report of a later run.
    $redis->del('analyze_performance_error');
}
} }

7
resources/python/dewarp/page_dewarp.py

@ -785,7 +785,7 @@ def get_page_dims(corners, rough_dims, params):
return dims return dims
def remap_image(name, img, small, page_dims, params):
def remap_image(name, dirname, img, small, page_dims, params):
height = 0.5 * page_dims[1] * OUTPUT_ZOOM * img.shape[0] height = 0.5 * page_dims[1] * OUTPUT_ZOOM * img.shape[0]
height = round_nearest_multiple(height, REMAP_DECIMATE) height = round_nearest_multiple(height, REMAP_DECIMATE)
@ -833,7 +833,7 @@ def remap_image(name, img, small, page_dims, params):
pil_image = pil_image.convert('1') pil_image = pil_image.convert('1')
threshfile = name + '_thresh.png' threshfile = name + '_thresh.png'
pil_image.save(threshfile, dpi=(OUTPUT_DPI, OUTPUT_DPI))
pil_image.save(dirname + '/' + threshfile, dpi=(OUTPUT_DPI, OUTPUT_DPI))
if DEBUG_LEVEL >= 1: if DEBUG_LEVEL >= 1:
height = small.shape[0] height = small.shape[0]
@ -861,6 +861,7 @@ def main():
img = cv2.imread(imgfile) img = cv2.imread(imgfile)
small = resize_to_screen(img) small = resize_to_screen(img)
basename = os.path.basename(imgfile) basename = os.path.basename(imgfile)
dirname = os.path.dirname(imgfile)
name, _ = os.path.splitext(basename) name, _ = os.path.splitext(basename)
print('loaded', basename, 'with size', imgsize(img), end=' ') print('loaded', basename, 'with size', imgsize(img), end=' ')
@ -907,7 +908,7 @@ def main():
page_dims = get_page_dims(corners, rough_dims, params) page_dims = get_page_dims(corners, rough_dims, params)
outfile = remap_image(name, img, small, page_dims, params)
outfile = remap_image(name, dirname, img, small, page_dims, params)
outfiles.append(outfile) outfiles.append(outfile)

29
resources/python/ocr/localize_text_tesseract.py

@ -0,0 +1,29 @@
"""Prototype: locate text-like regions in logo.jpg and save each one as a crop.

The image is binarised (Otsu), inverted and closed with a wide rectangular
kernel; the bounding box of every external contour taller than a threshold is
written to results/ROI_<n>.png and outlined on a preview that is shown at the end.
"""
import cv2

image = cv2.imread("logo.jpg", 1)
if image is None:
    # cv2.imread signals a missing/unreadable file by returning None rather than raising.
    raise SystemExit("Could not read image: logo.jpg")

# Untouched copy used for cropping, so the red outlines drawn on the preview
# below do not end up inside the saved regions.
clean = image.copy()

img = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU, img)
cv2.bitwise_not(img, img)

# A wide, short kernel joins neighbouring glyphs on the same line into one blob.
rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (30, 5))
img = cv2.morphologyEx(img, cv2.MORPH_CLOSE, rect_kernel)

# findContours returns (contours, hierarchy) or (image, contours, hierarchy)
# depending on the OpenCV major version; the contours are second-to-last in both.
contours = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[-2]

if len(contours) != 0:
    ROI_number = 0
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        # Depends on text size, so the greater the value the less objects we get.
        if h > 50:
            cv2.rectangle(image, (x, y), (x + w, y + h), (0, 0, 255), 1)
            ROI = clean[y:y + h, x:x + w]
            cv2.imwrite('results/ROI_{}.png'.format(ROI_number), ROI)
            ROI_number += 1

    cv2.imshow("Result", image)
    cv2.waitKey(0)

BIN
resources/python/ocr/logo.jpg

After

Width: 638  |  Height: 359  |  Size: 19 KiB

2
resources/python/ocr/results/.gitignore

@ -0,0 +1,2 @@
*
!.gitignore
Loading…
Cancel
Save