Browse Source

Performance analyzer. Apply OCR on images in digital documents

hidden_tags_with_bookmarks
Orzu Ionut 3 years ago
parent
commit
70debcfc58
  1. 85
      app/Console/Commands/AnalyzePerformance.php
  2. 16
      app/Ingest/Convertor.php
  3. 11
      app/Ingest/DocumentHandler.php
  4. 45
      app/Ingest/DocxConvertor.php
  5. 9
      app/Ingest/OCR.php
  6. 62
      app/Ingest/Office.php
  7. 18
      app/Ingest/OtherConvertor.php
  8. 104
      app/Ingest/PDFConvertor.php
  9. 66
      app/Jobs/IngestDocuments.php
  10. 7
      resources/python/dewarp/page_dewarp.py
  11. 29
      resources/python/ocr/localize_text_tesseract.py
  12. BIN
      resources/python/ocr/logo.jpg
  13. 2
      resources/python/ocr/results/.gitignore

85
app/Console/Commands/AnalyzePerformance.php

@ -0,0 +1,85 @@
<?php
namespace App\Console\Commands;
use App\Ingest\DocumentHandler;
use Illuminate\Console\Command;
use Illuminate\Http\UploadedFile;
use Illuminate\Support\Carbon;
use Illuminate\Support\Facades\Redis;
class AnalyzePerformance extends Command
{
    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'analyze:run {path : The directory path}';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = 'Run analyzer on multiple files in a directory.';

    /**
     * Create a new command instance.
     *
     * @return void
     */
    public function __construct()
    {
        parent::__construct();
    }

    /**
     * Execute the console command.
     *
     * Records the start time, the target directory and the number of files in
     * Redis, then hands every file found under the directory to a
     * DocumentHandler with the "from request" flag set to false.
     *
     * @return int Exit code: 0 on success, 1 when the path is not a directory.
     */
    public function handle()
    {
        $directoryPath = $this->argument('path');

        if (! is_string($directoryPath) || ! is_dir($directoryPath)) {
            $this->error('The path is invalid: not a directory.');

            // A non-zero exit code lets shells and schedulers detect the failure.
            return 1;
        }

        $redis = Redis::connection();
        $redis->set('analyze_performance_time', Carbon::now()->format('U'));
        $redis->set('analyze_performance_path', $directoryPath);
        // Drop any failure flag left behind by an earlier run.
        $redis->del('analyze_performance_error');

        $allFiles = $this->getDirContents($directoryPath);

        if (count($allFiles) === 0) {
            // With nothing to process, no job would ever write the results file.
            $this->warn('No files were found in the directory you have provided.');

            return 0;
        }

        $redis->set('analyze_performance_remaining_files', count($allFiles));

        foreach ($allFiles as $index => $file) {
            // Without an explicit MIME type UploadedFile reports
            // "application/octet-stream" from getClientMimeType(), which is the
            // value DocumentHandler checks against its supported types.
            $mimeType = function_exists('mime_content_type') ? mime_content_type($file) : false;

            $handler = new DocumentHandler(
                $index,
                new UploadedFile($file, "File {$index}", $mimeType ?: null),
                false
            );
            $handler->handle();
        }

        $this->info('Processing... When it\'s done the results will be added to the \'ingest_analyze_performance.txt\' file in the directory you have provided.');

        return 0;
    }

    /**
     * Recursively collect the resolved paths of all files below a directory.
     *
     * @param string $dir     Directory to scan.
     * @param array  $results Accumulator shared across the recursive calls.
     *
     * @return array List of file paths as returned by realpath().
     */
    protected function getDirContents($dir, &$results = [])
    {
        $entries = scandir($dir);

        // scandir() yields false when the directory cannot be read.
        if ($entries === false) {
            return $results;
        }

        foreach ($entries as $entry) {
            if ($entry === '.' || $entry === '..') {
                continue;
            }

            $path = realpath($dir . DIRECTORY_SEPARATOR . $entry);

            // realpath() yields false for entries that cannot be resolved
            // (e.g. dangling symlinks); such entries cannot be ingested.
            if ($path === false) {
                continue;
            }

            if (is_dir($path)) {
                $this->getDirContents($path, $results);
            } else {
                $results[] = $path;
            }
        }

        return $results;
    }
}

16
app/Ingest/Convertor.php

@ -45,22 +45,16 @@ class Convertor
private function convertToHtml()
{
(new Process(['export HOME=' . env('USER_HOME_PATH')]))->run();
$office = new Office();
$process = new Process([
'soffice',
'--headless',
'--convert-to',
$success = $office->run(
'html:HTML:EmbedImages',
$this->storage->path($this->path),
'--outdir',
$this->storage->path('contracts')
]);
$process->run();
);
if (!$process->isSuccessful()) {
throw new ProcessFailedException($process);
if (! $success) {
throw new \Exception('Something went wrong while tried converting to HTML for file: ' . $this->path);
}
$this->storage->delete($this->path);

11
app/Ingest/DocumentHandler.php

@ -9,6 +9,7 @@ class DocumentHandler
{
protected $id;
protected $document;
protected $fromRequest;
const DOCX_MIME_TYPE = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';
const DOC_MIME_TYPE = 'application/msword';
@ -34,18 +35,18 @@ class DocumentHandler
self::PLAIN_TEXT_TYPE => 'txt',
];
public function __construct($id, $document)
public function __construct($id, $document, $fromRequest = true)
{
$this->id = $id;
$this->document = $document;
$this->fromRequest = $fromRequest;
}
public function handle()
{
$storage = Storage::disk('local');
$file = request()->file('document');
$mimeType = $file->getClientMimeType();
$mimeType = $this->document->getClientMimeType();
if (!array_key_exists($mimeType, $this->supportedFiles)) {
throw new \Exception('File not supported.');
@ -55,8 +56,8 @@ class DocumentHandler
$id = str_replace(' ', '_', $this->id);
$path = $storage->putFileAs("contracts/$id", $file, "document.$type");
$path = $storage->putFileAs("contracts/$id", $this->document, "document.$type");
IngestDocuments::dispatch($this->id, $path, $type);
IngestDocuments::dispatch($this->id, $path, $type, $this->fromRequest);
}
}

45
app/Ingest/DocxConvertor.php

@ -9,33 +9,33 @@ class DocxConvertor extends AbstractConvertor
{
public function execute()
{
// $this->convertToText();
//
// $convertor = new TextConvertor($this->storage, "$this->directoryPath/document.txt");
//
// $convertor->execute();
$this->convertToPdfWithLibreOffice();
$this->convertToPDF();
$pdfFilePath = "$this->directoryPath/document.pdf";
$convertor = new PDFConvertor($this->storage, "$this->directoryPath/document.pdf");
if ( ! $this->storage->exists($pdfFilePath)) {
throw new \Exception('Failed to convert to PDF: ' . $pdfFilePath);
}
$convertor = new PDFConvertor($this->storage, $pdfFilePath);
$convertor->execute();
}
protected function convertToText()
protected function convertToPDF()
{
(new Process(['export HOME=' . env('USER_HOME_PATH')]))->run();
$process = new Process([
'soffice',
'--headless',
'--convert-to',
'txt',
'unoconv',
'-f',
'pdf',
// '-c=socket,host=localhost,port=' . (2000 + rand(2, 7)) . ';urp;StarOffice.ComponentContext',
$this->storage->path($this->path),
'--outdir',
$this->storage->path($this->directoryPath)
]);
$process->setTimeout(10);
$process->run();
if (!$process->isSuccessful()) {
@ -45,21 +45,18 @@ class DocxConvertor extends AbstractConvertor
$this->deleteOriginalDocument();
}
protected function convertToPDF()
protected function convertToPdfWithLibreOffice()
{
(new Process(['export HOME=' . env('USER_HOME_PATH')]))->run();
$office = new Office();
$process = new Process([
'unoconv',
'-f',
$success = $office->run(
'pdf',
$this->storage->path($this->path),
]);
$process->run();
$this->storage->path($this->directoryPath)
);
if (!$process->isSuccessful()) {
throw new ProcessFailedException($process);
if (! $success) {
throw new \Exception('Failed when converting from DOCX to PDF for file: ' . $this->path);
}
$this->deleteOriginalDocument();

9
app/Ingest/OCR.php

@ -52,13 +52,10 @@ class OCR
$directory = pathinfo($this->path, PATHINFO_DIRNAME);
$newPath = "$directory/$filePath";
$moved = File::move(base_path($filePath), $newPath);
if ( ! $moved) {
throw new \Exception('Something went wrong while moving file.');
// The file may not be created by the library for various reasons, including if it does not have text.
if (File::exists($newPath)) {
$this->path = $newPath;
}
$this->path = $newPath;
}
protected function applyDeskew()

62
app/Ingest/Office.php

@ -0,0 +1,62 @@
<?php
namespace App\Ingest;
use Symfony\Component\Process\Process;
/**
 * Runs headless LibreOffice ("soffice") conversions, each with its own
 * throw-away user profile directory under /tmp.
 */
class Office
{
    /**
     * Unique identifier used in the pipe name and the profile directory name.
     *
     * @var string
     */
    protected $id;

    /**
     * Name (not full path) of the temporary profile directory.
     *
     * @var string
     */
    protected $directory;

    public function __construct()
    {
        $this->id = uniqid();
        $this->directory = 'soffice-dir-' . $this->id;
    }

    /**
     * Convert a file with soffice.
     *
     * @param string $convertTo     Value passed to soffice's --convert-to option.
     * @param string $filePath      Path of the file to convert.
     * @param string $directoryPath Value passed to soffice's --outdir option.
     *
     * @return bool Whether the soffice process exited successfully.
     */
    public function run($convertTo, $filePath, $directoryPath)
    {
        $this->makeTemporaryDirectory();

        try {
            return $this->runConversion($convertTo, $filePath, $directoryPath);
        } finally {
            // Always clean up, even when the process times out and throws.
            $this->removeTemporaryDirectory();
        }
    }

    /**
     * Build and run the soffice process.
     *
     * @param string $convertTo
     * @param string $filePath
     * @param string $directoryPath
     *
     * @return bool
     */
    protected function runConversion($convertTo, $filePath, $directoryPath)
    {
        // Arguments are passed as an array, so no shell is involved and they
        // must not carry shell quoting.
        $process = new Process([
            'soffice',
            '--accept=pipe,name=soffice-pipe-' . $this->id . ';urp;StarOffice.ServiceManager',
            '-env:UserInstallation=file://' . $this->temporaryDirectoryPath(),
            '--headless',
            '--convert-to',
            $convertTo,
            $filePath,
            '--outdir',
            $directoryPath,
        ], null, $this->processEnvironment());

        $process->setTimeout(10);
        $process->run();

        return $process->isSuccessful();
    }

    /**
     * Environment overrides for the soffice process.
     *
     * Running "export HOME=..." as a separate process cannot affect soffice,
     * so HOME is handed to the process directly instead.
     * NOTE(review): relies on Symfony Process merging these values with the
     * inherited environment (the default from Symfony 4.0) - confirm the
     * installed version.
     *
     * @return array|null
     */
    protected function processEnvironment()
    {
        $home = env('USER_HOME_PATH');

        return $home ? ['HOME' => $home] : null;
    }

    /**
     * Absolute path of the temporary profile directory.
     *
     * @return string
     */
    protected function temporaryDirectoryPath()
    {
        return '/tmp/' . $this->directory;
    }

    /**
     * Create the temporary profile directory (best effort).
     */
    protected function makeTemporaryDirectory()
    {
        // Command and arguments are separate array elements; a single
        // "mkdir /tmp/..." string would be looked up as one executable name.
        (new Process(['mkdir', '-p', $this->temporaryDirectoryPath()]))->run();
    }

    /**
     * Remove the temporary profile directory (best effort).
     */
    protected function removeTemporaryDirectory()
    {
        (new Process(['rm', '-rf', $this->temporaryDirectoryPath()]))->run();
    }
}

18
app/Ingest/OtherConvertor.php

@ -24,25 +24,19 @@ class OtherConvertor extends AbstractConvertor
*/
private function convertToDocx()
{
(new Process(['export HOME=' . env('USER_HOME_PATH')]))->run();
/**
* Convert doc,dot,rtf,odt to docx
*/
$process = new Process([
'soffice',
'--headless',
'--convert-to',
$office = new Office();
$success = $office->run(
'docx',
$this->storage->path($this->path),
'--outdir',
$this->storage->path($this->directoryPath)
]);
$process->run();
);
if (!$process->isSuccessful()) {
throw new ProcessFailedException($process);
if (! $success) {
throw new \Exception('Something went wrong while tried converting to DOCX for file: ' . $this->path);
}
$this->deleteOriginalDocument();

104
app/Ingest/PDFConvertor.php

@ -12,47 +12,13 @@ class PDFConvertor extends AbstractConvertor
{
// $this->prepareForConvertPDF();
$result = $this->getFileContents();
$contents = $this->getFileContents();
if ( ! $result['has_images'] && ! $result['has_text']) {
if ( ! $contents) {
throw new \Exception('Cannot get pdf file contents.');
}
if ($result['has_text']) {
$mdContents = '';
foreach ($result['htmls'] as $html) {
$converter = new HtmlConverter();
$converter->getConfig()->setOption('strip_tags', true);
$contents = $converter->convert($html);
$mdContents = $mdContents . "\n\n" . $contents;
}
$this->storage->put("$this->directoryPath/document.md", $mdContents);
return;
}
// Only contains images.
$imagesContent = '';
$files = $this->storage->allFiles($this->path);
foreach ($files as $file) {
// Only get the image files from the directory, it may contain some empty html files too.
// @TODO Only OCR images with text and delete them afterwards, the remaining ignore and keep.
if (in_array(pathinfo($file, PATHINFO_EXTENSION), ['jpg', 'png'])) {
$ocr = new OCR($this->storage->path($file));
$imagesContent = $imagesContent . $ocr->execute();
$this->storage->delete($file);
}
}
$this->storage->put("$this->directoryPath/document.md", $imagesContent);
$this->storage->put("$this->directoryPath/document.md", $contents);
}
protected function getFileContents()
@ -115,13 +81,14 @@ class PDFConvertor extends AbstractConvertor
ksort($orderedList[$pageNumber]);
}
$htmls = [];
$hasImages = false;
$hasText = false;
$imagesCount = 0;
$imagesInFooter = true;
$mdContents = '';
try {
foreach ($orderedList as $page) {
$html = '';
@ -132,19 +99,38 @@ class PDFConvertor extends AbstractConvertor
foreach ($items as $p) {
if ($p->getName() == 'image') {
$hasImages = true;
$basePath = $this->storage->path('');
$imageFilePath = str_replace($basePath, '', $p['src']);
$imagesCount += 1;
$caption = "Fig. $imagesCount";
$textContents = $this->applyOCR($imageFilePath);
$imageHTML = $this->handleImage($p, $caption);
if ($textContents) {
if ($html) {
$mdContents = $mdContents . $this->convertHtmlToMD($html) . "\n";
if ( ! $imagesInFooter) {
$html = $html . $imageHTML;
$html = '';
}
$mdContents = $mdContents . $textContents . "\n";
$this->storage->delete($imageFilePath);
$hasText = true;
} else {
$html = $html . "<p> $caption </p>";
$hasImages = true;
$imagesCount += 1;
$caption = "Fig. $imagesCount";
$footerImages[] = $imageHTML;
$imageHTML = $this->handleImage($p, $caption);
if ( ! $imagesInFooter) {
$html = $html . $imageHTML;
} else {
$html = $html . "<p> $caption </p>";
$footerImages[] = $imageHTML;
}
}
}
@ -161,11 +147,10 @@ class PDFConvertor extends AbstractConvertor
if ($imagesInFooter) {
foreach ($footerImages as $index => $footerImage) {
$html = $html . '<p>' . $footerImage . '</p>';
// $html = $html . '<p> Fig. ' . ($index + 1) . '</p>';
}
}
$htmls[] = '<html><head><title></title></head><body>' . $html . '</body></html>';
$mdContents = $mdContents . $this->convertHtmlToMD($html) . "\n\n";
}
} catch (\Exception $exception) {
$this->storage->deleteDirectory($this->directoryPath);
@ -183,11 +168,7 @@ class PDFConvertor extends AbstractConvertor
$this->storage->delete($xmlFilePath);
}
return [
'has_images' => $hasImages,
'has_text' => $hasText,
'htmls' => $htmls,
];
return $mdContents;
}
protected function handleImage($p, $caption)
@ -254,6 +235,23 @@ class PDFConvertor extends AbstractConvertor
return 'span';
}
/**
 * Run OCR on a file held in this convertor's storage.
 *
 * @param string $path Path of the file relative to the storage root.
 *
 * @return mixed Whatever OCR::execute() returns for the file.
 */
protected function applyOCR($path)
{
    $absolutePath = $this->storage->path($path);

    return (new OCR($absolutePath))->execute();
}
/**
 * Wrap an HTML fragment in a minimal document and convert it with
 * HtmlConverter, with the converter's "strip_tags" option enabled.
 *
 * @param string $contents HTML fragment placed inside the <body> element.
 *
 * @return mixed Result of HtmlConverter::convert().
 */
protected function convertHtmlToMD($contents)
{
    $markdownConverter = new HtmlConverter();
    $markdownConverter->getConfig()->setOption('strip_tags', true);

    return $markdownConverter->convert(
        '<html><head><title></title></head><body>' . $contents . '</body></html>'
    );
}
protected function prepareForConvertPDF()
{
(new Process(['export HOME=' . env('USER_HOME_PATH')]))->run();

66
app/Jobs/IngestDocuments.php

@ -11,7 +11,9 @@ use Illuminate\Bus\Queueable;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Bus\Dispatchable;
use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Support\Carbon;
use Illuminate\Support\Facades\Log;
use Illuminate\Support\Facades\Redis;
use Illuminate\Support\Facades\Storage;
class IngestDocuments implements ShouldQueue
@ -21,6 +23,7 @@ class IngestDocuments implements ShouldQueue
protected $id;
private $path;
protected $type;
protected $fromRequest;
/**
* @var \Illuminate\Contracts\Filesystem\Filesystem
@ -53,12 +56,14 @@ class IngestDocuments implements ShouldQueue
* @param $id
* @param string $path
* @param $type
* @param $fromRequest
*/
public function __construct($id, string $path, $type)
public function __construct($id, string $path, $type, $fromRequest)
{
$this->id = $id;
$this->path = $path;
$this->type = $type;
$this->fromRequest = $fromRequest;
$this->storage = Storage::disk('local');
$this->parserDocx = new ParseDocx();
@ -86,7 +91,17 @@ class IngestDocuments implements ShouldQueue
return;
}
SendToCore::dispatch($this->id, pathinfo($this->path, PATHINFO_DIRNAME));
$directoryPath = pathinfo($this->path, PATHINFO_DIRNAME);
if ($this->fromRequest) {
SendToCore::dispatch($this->id, $directoryPath);
return;
}
$this->storage->deleteDirectory($directoryPath);
$this->updateAnalyzer();
}
public function failed()
@ -95,13 +110,48 @@ class IngestDocuments implements ShouldQueue
$this->storage = Storage::disk('local');
}
Log::error('Ingest documents failed.');
Log::error('Ingest documents failed. ' . $this->path);
$directoryPath = pathinfo($this->path, PATHINFO_DIRNAME);
// // @TODO Delete docx, txt and md files.
// if ($this->storage->exists($this->path)) {
// $this->storage->delete($this->path);
// }
if ($this->fromRequest) {
SendToCore::dispatch($this->id, $directoryPath, true);
return;
}
$this->storage->deleteDirectory($directoryPath);
$this->updateAnalyzer(true);
}
/**
 * Record the completion of one analyzer file and, once no files remain,
 * write the elapsed time to 'ingest_analyze_performance.txt' in the
 * analyzed directory.
 *
 * @param bool $failed Whether the current job failed.
 */
protected function updateAnalyzer($failed = false)
{
    $redis = Redis::connection();

    if ($failed) {
        $redis->set('analyze_performance_error', '1');
    }

    // DECR is atomic, so jobs finishing at the same time on different
    // workers cannot overwrite each other's decrement the way a separate
    // get / subtract / set sequence can.
    $remainingFiles = (int) $redis->decr('analyze_performance_remaining_files');

    if ($remainingFiles !== 0) {
        return;
    }

    $startedAt = (int) $redis->get('analyze_performance_time');
    $endedAt = (int) Carbon::now()->format('U');
    $directoryPath = $redis->get('analyze_performance_path');

    // Report a failure of any job in the run, not only of the last one.
    $anyFailed = $failed || (bool) $redis->get('analyze_performance_error');

    $data = 'Time elapsed in seconds: ' . ($endedAt - $startedAt) . "\n";

    if ($anyFailed) {
        $data = $data . 'Something went wrong while processing the files.';
    }

    file_put_contents($directoryPath . '/ingest_analyze_performance.txt', $data);

    // Reset the flag so it does not leak into the next run.
    $redis->del('analyze_performance_error');
}
}

7
resources/python/dewarp/page_dewarp.py

@ -785,7 +785,7 @@ def get_page_dims(corners, rough_dims, params):
return dims
def remap_image(name, img, small, page_dims, params):
def remap_image(name, dirname, img, small, page_dims, params):
height = 0.5 * page_dims[1] * OUTPUT_ZOOM * img.shape[0]
height = round_nearest_multiple(height, REMAP_DECIMATE)
@ -833,7 +833,7 @@ def remap_image(name, img, small, page_dims, params):
pil_image = pil_image.convert('1')
threshfile = name + '_thresh.png'
pil_image.save(threshfile, dpi=(OUTPUT_DPI, OUTPUT_DPI))
pil_image.save(dirname + '/' + threshfile, dpi=(OUTPUT_DPI, OUTPUT_DPI))
if DEBUG_LEVEL >= 1:
height = small.shape[0]
@ -861,6 +861,7 @@ def main():
img = cv2.imread(imgfile)
small = resize_to_screen(img)
basename = os.path.basename(imgfile)
dirname = os.path.dirname(imgfile)
name, _ = os.path.splitext(basename)
print('loaded', basename, 'with size', imgsize(img), end=' ')
@ -907,7 +908,7 @@ def main():
page_dims = get_page_dims(corners, rough_dims, params)
outfile = remap_image(name, img, small, page_dims, params)
outfile = remap_image(name, dirname, img, small, page_dims, params)
outfiles.append(outfile)

29
resources/python/ocr/localize_text_tesseract.py

@ -0,0 +1,29 @@
import os
import sys

import cv2

# Bounding boxes must be taller than this (in pixels) to be kept.
# Depends on text size, so the greater the value the less objects we get.
MIN_REGION_HEIGHT = 50


def localize_text(image_path="logo.jpg", results_dir="results", min_height=MIN_REGION_HEIGHT):
    """Find tall connected regions in an image and save each one as a PNG crop.

    The image is binarised (Otsu), inverted and closed with a 30x5 rectangular
    kernel; the bounding box of every external contour taller than
    ``min_height`` is outlined in red on the image and its crop is written to
    ``<results_dir>/ROI_<n>.png``.

    Returns a tuple ``(annotated_image, number_of_crops_written)``.
    Raises IOError when the image cannot be read.
    """
    image = cv2.imread(image_path, 1)
    if image is None:
        # cv2.imread signals a missing/unreadable file by returning None.
        raise IOError("Could not read image: {}".format(image_path))

    # Crops are taken from an untouched copy so they do not contain the red
    # outlines drawn on the annotated image.
    pristine = image.copy()

    img = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU, img)
    cv2.bitwise_not(img, img)

    rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (30, 5))
    img = cv2.morphologyEx(img, cv2.MORPH_CLOSE, rect_kernel)

    # findContours returns (contours, hierarchy) in OpenCV 2.x/4.x and
    # (image, contours, hierarchy) in 3.x; contours is second-to-last in both.
    contours = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[-2]

    # cv2.imwrite fails silently when the target directory does not exist.
    if not os.path.isdir(results_dir):
        os.makedirs(results_dir)

    roi_number = 0
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        if h > min_height:
            cv2.rectangle(image, (x, y), (x + w, y + h), (0, 0, 255), 1)
            roi = pristine[y:y + h, x:x + w]
            cv2.imwrite(os.path.join(results_dir, 'ROI_{}.png'.format(roi_number)), roi)
            roi_number += 1

    return image, roi_number


if __name__ == "__main__":
    # Optional first argument: path of the image to analyse.
    annotated, _ = localize_text(sys.argv[1] if len(sys.argv) > 1 else "logo.jpg")
    cv2.imshow("Result", annotated)
    cv2.waitKey(0)

BIN
resources/python/ocr/logo.jpg

After

Width: 638  |  Height: 359  |  Size: 19 KiB

2
resources/python/ocr/results/.gitignore

@ -0,0 +1,2 @@
*
!.gitignore
Loading…
Cancel
Save