Browse Source

Recreate original document with S&D data

hidden_tags_with_bookmarks
Orzu Ionut 3 years ago
parent
commit
332fa674eb
  1. 3
      .env.example
  2. 7
      app/Console/Commands/AnalyzePerformance.php
  3. 8
      app/Http/Controllers/IngestController.php
  4. 52
      app/Http/Controllers/RecreateDocumentController.php
  5. 6
      app/Ingest/AbstractConvertor.php
  6. 1
      app/Ingest/Convertor.php
  7. 72
      app/Ingest/DataJsonConvertor.php
  8. 20
      app/Ingest/DocumentHandler.php
  9. 8
      app/Ingest/DocxConvertor.php
  10. 771
      app/Ingest/DocxReader.php
  11. 291
      app/Ingest/DocxWriter.php
  12. 2
      app/Ingest/OCR.php
  13. 8
      app/Ingest/PDFConvertor.php
  14. 79
      app/Jobs/IngestDocuments.php
  15. 131
      app/Jobs/RecreateDocument.php
  16. 53
      app/Jobs/SendToCore.php
  17. 3
      composer.json
  18. 2
      routes/web.php
  19. 30
      tests/Feature/IngestDocxTest.php
  20. 36
      tests/Feature/ProcessDocxDocumentTest.php

3
.env.example

@ -15,6 +15,9 @@ SESSION_LIFETIME=120
REDIS_HOST=127.0.0.1
REDIS_PASSWORD=null
REDIS_PORT=6379
REDIS_QUEUE=
WEBHOOK_CORE_URL=
WEBHOOK_CORE_SECRET=
USER_HOME_PATH=

7
app/Console/Commands/AnalyzePerformance.php

@ -58,7 +58,12 @@ class AnalyzePerformance extends Command
$redis->set('analyze_performance_remaining_files', count($allFiles));
foreach ($allFiles as $index => $file) {
$handler = new DocumentHandler($index, new UploadedFile($file, "File {$index}"), false);
// DocumentHandler expects ($id, $fileResultType, $mimeType, $document, $fromRequest).
// The MIME type argument was missing, which shifted the uploaded file into the
// $mimeType slot and `false` into the $document slot.
$document = new UploadedFile($file, "File {$index}");
$handler = new DocumentHandler(
    $index,
    'md',
    // Detected from the file itself, since there is no request carrying a mime_type here.
    $document->getMimeType(),
    $document,
    false
);
$handler->handle();
}

8
app/Http/Controllers/IngestController.php

@ -10,11 +10,17 @@ class IngestController extends Controller
{
request()->validate([
'id' => 'required',
'file_result_type' => 'required|in:md,original',
'document' => 'required|file',
]);
try {
$handler = new DocumentHandler(request()->get('id'), request()->file('document'));
$handler = new DocumentHandler(
request()->get('id'),
request()->get('file_result_type'),
request()->get('mime_type'),
request()->file('document')
);
$handler->handle();

52
app/Http/Controllers/RecreateDocumentController.php

@ -0,0 +1,52 @@
<?php
namespace App\Http\Controllers;
use App\Jobs\RecreateDocument;
use Illuminate\Support\Facades\Storage;
/**
 * HTTP endpoints for rebuilding a document from previously extracted data
 * and for downloading the rebuilt file.
 */
class RecreateDocumentController extends Controller
{
    /**
     * Queue a RecreateDocument job for the given document id and JSON payload.
     *
     * Expects the request fields `id` and `data` (a JSON encoded string).
     *
     * @return \Illuminate\Http\JsonResponse `processing` on success, `fail` (HTTP 400) otherwise.
     */
    public function store()
    {
        request()->validate([
            'id' => 'required',
            'data' => 'required',
        ]);

        $id = request()->get('id');
        $rawData = request()->get('data');

        // Decode up front and refuse anything that is not a JSON object/array:
        // previously malformed JSON was dispatched as `null` and only failed
        // later inside the queued job, after the client had been told "processing".
        $data = is_string($rawData) ? json_decode($rawData, true) : null;

        if (! is_array($data)) {
            return response()->json([
                'status' => 'fail',
                'message' => 'The data field must contain a JSON encoded object.',
            ], 400);
        }

        try {
            RecreateDocument::dispatch($id, $data);

            return response()->json([
                'status' => 'processing',
            ]);
        } catch (\Exception $exception) {
            return response()->json([
                'status' => 'fail',
                'message' => $exception->getMessage(),
            ], 400);
        }
    }

    /**
     * Download a recreated document and delete it once it has been sent.
     *
     * The file is looked up on the local disk as `contracts/{id}-{file_path}`.
     * An empty string is returned when the file does not exist or when
     * `file_path` is not an acceptable value.
     *
     * @param  mixed  $id
     * @return mixed
     */
    public function show($id)
    {
        request()->validate([
            'file_path' => 'required',
        ]);

        $filePath = request()->get('file_path');

        // `file_path` is user input that ends up in a path which is read and then
        // deleted, so parent-directory segments must never be allowed through.
        if (! is_string($filePath) || preg_match('#(^|[\\\\/])\.\.([\\\\/]|$)#', $filePath)) {
            return '';
        }

        $storage = Storage::disk('local');
        $fullPath = 'contracts/' . $id . '-' . $filePath;

        if (! $storage->exists($fullPath)) {
            return '';
        }

        return response()->download($storage->path($fullPath), 'document.docx', [])
            ->deleteFileAfterSend(true);
    }
}

6
app/Ingest/AbstractConvertor.php

@ -17,6 +17,12 @@ abstract class AbstractConvertor
abstract public function execute();
public function setPath($path)
{
$this->path = $path;
$this->directoryPath = pathinfo($path, PATHINFO_DIRNAME);
}
protected function deleteOriginalDocument()
{
$this->storage->delete($this->path);

1
app/Ingest/Convertor.php

@ -23,7 +23,6 @@ class Convertor
}
/**
* @return mixed
* @throws \Exception
*/
public function execute()

72
app/Ingest/DataJsonConvertor.php

@ -0,0 +1,72 @@
<?php
namespace App\Ingest;
use Illuminate\Support\Facades\Storage;
/**
 * Turns a stored document into `document.json`, a JSON description of its
 * contents, converting it to DOCX first when it is in another format.
 */
class DataJsonConvertor extends AbstractConvertor
{
    /**
     * Current format of the file at $this->path (e.g. "docx").
     *
     * @var string
     */
    protected $type;

    /**
     * @param string $path Path of the document on the local disk.
     * @param string $type Format of the document.
     */
    public function __construct($path, $type)
    {
        parent::__construct(Storage::disk('local'), $path);

        $this->type = $type;
    }

    /**
     * Convert given document to JSON file which contains the document's data.
     *
     * The JSON is written next to the document as `document.json`; the source
     * document is deleted afterwards.
     *
     * @throws \Exception When the DOCX conversion or the JSON encoding fails.
     */
    public function execute()
    {
        if ($this->type !== 'docx') {
            $this->convertToDocx();
        }

        $json = json_encode($this->convertDocxToJson());

        // Fail before the source document is deleted instead of silently
        // storing `false` when the data cannot be encoded.
        if ($json === false) {
            throw new \Exception(
                'Failed when encoding document data to JSON for file: ' . $this->path
                . ' (' . json_last_error_msg() . ')'
            );
        }

        $this->storage->put("$this->directoryPath/document.json", $json);

        $this->deleteOriginalDocument();
    }

    /**
     * Read the DOCX at the current path into an array structure.
     *
     * @return array
     */
    protected function convertDocxToJson()
    {
        $reader = new DocxReader($this->storage, $this->path);

        return $reader->execute();
    }

    /**
     * Convert document to DOCX format in order to extract data.
     *
     * On success the original file is deleted and the convertor points at the
     * generated DOCX file.
     *
     * @throws \Exception
     */
    protected function convertToDocx()
    {
        $office = new Office();

        $success = $office->run(
            'docx',
            $this->storage->path($this->path),
            $this->storage->path($this->directoryPath)
        );

        if (! $success) {
            throw new \Exception('Failed when converting from ' . $this->type . ' to DOCX for file: ' . $this->path);
        }

        $this->deleteOriginalDocument();

        // Swap only the file extension. Replacing every occurrence of the type
        // in the path would also rewrite directory names that happen to contain
        // it (for example an id such as "pdf_42").
        $this->setPath(
            $this->directoryPath . '/' . pathinfo($this->path, PATHINFO_FILENAME) . '.docx'
        );
        $this->type = 'docx';
    }
}

20
app/Ingest/DocumentHandler.php

@ -8,6 +8,8 @@ use Illuminate\Support\Facades\Storage;
class DocumentHandler
{
protected $id;
protected $fileResultType;
protected $mimeType;
protected $document;
protected $fromRequest;
@ -35,9 +37,11 @@ class DocumentHandler
self::PLAIN_TEXT_TYPE => 'txt',
];
public function __construct($id, $document, $fromRequest = true)
public function __construct($id, $fileResultType, $mimeType, $document, $fromRequest = true)
{
$this->id = $id;
$this->fileResultType = $fileResultType;
$this->mimeType = $mimeType;
$this->document = $document;
$this->fromRequest = $fromRequest;
}
@ -46,18 +50,22 @@ class DocumentHandler
{
$storage = Storage::disk('local');
$mimeType = $this->document->getClientMimeType();
if (!array_key_exists($mimeType, $this->supportedFiles)) {
if (!array_key_exists($this->mimeType, $this->supportedFiles)) {
throw new \Exception('File not supported.');
}
$type = $this->supportedFiles[$mimeType];
$type = $this->supportedFiles[$this->mimeType];
$id = str_replace(' ', '_', $this->id);
$path = $storage->putFileAs("contracts/$id", $this->document, "document.$type");
IngestDocuments::dispatch($this->id, $path, $type, $this->fromRequest);
IngestDocuments::dispatch(
$this->id,
$this->fileResultType,
$path,
$type,
$this->fromRequest
);
}
}

8
app/Ingest/DocxConvertor.php

@ -7,6 +7,10 @@ use Symfony\Component\Process\Process;
class DocxConvertor extends AbstractConvertor
{
/**
*
* @throws \Exception
*/
public function execute()
{
$this->convertToPdfWithLibreOffice();
@ -45,6 +49,10 @@ class DocxConvertor extends AbstractConvertor
$this->deleteOriginalDocument();
}
/**
*
* @throws \Exception
*/
protected function convertToPdfWithLibreOffice()
{
$office = new Office();

771
app/Ingest/DocxReader.php

@ -0,0 +1,771 @@
<?php
namespace App\Ingest;
use PhpOffice\PhpWord\Element\AbstractElement;
use PhpOffice\PhpWord\Element\Bookmark;
use PhpOffice\PhpWord\Element\Header;
use PhpOffice\PhpWord\Element\Image;
use PhpOffice\PhpWord\Element\Line;
use PhpOffice\PhpWord\Element\Link;
use PhpOffice\PhpWord\Element\ListItem;
use PhpOffice\PhpWord\Element\ListItemRun;
use PhpOffice\PhpWord\Element\PageBreak;
use PhpOffice\PhpWord\Element\PreserveText;
use PhpOffice\PhpWord\Element\Section;
use PhpOffice\PhpWord\Element\Table;
use PhpOffice\PhpWord\Element\Text;
use PhpOffice\PhpWord\Element\TextBreak;
use PhpOffice\PhpWord\Element\TextRun;
use PhpOffice\PhpWord\Element\Title;
use PhpOffice\PhpWord\IOFactory;
use PhpOffice\PhpWord\Style;
/**
 * Reads a DOCX file through PHPWord and flattens it into a plain array
 * (styles, sections, nested elements and a running text index) that can be
 * stored as JSON.
 */
class DocxReader extends AbstractConvertor
{
    /**
     * Concatenated text of the document plus one entry (hash, range_start,
     * range_end) per collected text fragment.
     *
     * @var array
     */
    protected $textContents;

    /**
     * Length of the text collected so far, i.e. the offset at which the next
     * fragment starts. Measured with strlen().
     *
     * @var int
     */
    protected $textLength;

    /**
     * @param mixed  $storage Storage disk the document lives on.
     * @param string $path    Path of the document on that disk.
     */
    public function __construct($storage, $path)
    {
        parent::__construct($storage, $path);

        $this->textContents = [
            'text' => '',
            'elements' => [],
        ];
        $this->textLength = 0;
    }

    /**
     * Load the document and return its array representation.
     *
     * Converting to HTML and then back to DOCX loses some content and styles,
     * which is why the PHPWord object model is serialized directly instead.
     *
     * @return array Keys: default_font_name, default_font_size, styles, elements, contents.
     */
    public function execute()
    {
        // Named styles live in a static registry on PhpOffice\PhpWord\Style that
        // is shared by the whole PHP process. Clear it so styles registered while
        // handling an earlier document (e.g. in a long-running queue worker)
        // cannot show up in, or shadow, the styles of this one.
        Style::resetStyles();

        $handler = IOFactory::load($this->storage->path($this->path));

        $data = [];
        $data['default_font_name'] = $handler->getDefaultFontName();
        $data['default_font_size'] = $handler->getDefaultFontSize();
        $data['styles'] = $this->handleStyles(Style::getStyles());

        $elements = [];

        foreach ($handler->getSections() as $section) {
            $elements[] = $this->sectionToJson($section);
        }

        $data['elements'] = $elements;
        // Must be read last: it is filled while the sections are serialized.
        $data['contents'] = $this->textContents;

        return $data;
    }

    /**
     * Serialize a section together with its elements, headers, footers and style.
     *
     * @return array
     */
    protected function sectionToJson(Section $section)
    {
        return array_merge($this->elementToJson($section), [
            'element_type' => 'Section',
            'elements' => $this->elementsToJson($section->getElements()),
            'footers' => $this->handleFooters($section->getFooters()),
            'headers' => $this->handleHeaders($section->getHeaders()),
            'style' => $this->sectionStyleToJson($section->getStyle()),
        ]);
    }

    /**
     * Serialize every registered style, keeping the style names as array keys.
     *
     * The serializer is picked from the style's class name, e.g. a `Font`
     * style is handled by fontStyleToJson().
     *
     * @param  array $styles Style objects keyed by style name.
     * @return array
     *
     * @throws \RuntimeException When no serializer exists for a style class.
     */
    protected function handleStyles($styles)
    {
        return array_map(function ($style) {
            $handler = lcfirst($this->shortClassName($style)) . 'StyleToJson';

            if (! method_exists($this, $handler)) {
                throw new \RuntimeException('Unsupported document style: ' . get_class($style));
            }

            return $this->$handler($style);
        }, $styles);
    }

    /**
     * @param  array $headers
     * @return array
     */
    protected function handleHeaders(array $headers)
    {
        $list = [];

        foreach ($headers as $header) {
            $list[] = $this->headerToJson($header);
        }

        return $list;
    }

    /**
     * @param  array $footers
     * @return array
     */
    protected function handleFooters(array $footers)
    {
        $list = [];

        foreach ($footers as $footer) {
            $list[] = $this->footerToJson($footer);
        }

        return $list;
    }

    /**
     * Serialize a single element with the serializer matching its class name,
     * e.g. a `TextRun` element is handled by textRunToJson().
     *
     * @param  AbstractElement $element
     * @return array
     *
     * @throws \RuntimeException When no serializer exists for the element class.
     */
    protected function handleElement($element)
    {
        $handler = lcfirst($this->shortClassName($element)) . 'ToJson';

        // Raise a regular exception rather than letting the dynamic call end in
        // a fatal "undefined method" Error, which `catch (\Exception)` blocks
        // do not see.
        if (! method_exists($this, $handler)) {
            throw new \RuntimeException('Unsupported document element: ' . get_class($element));
        }

        return array_merge(
            $this->elementToJson($element),
            $this->$handler($element)
        );
    }

    /**
     * Properties shared by every element. Comment ranges and the parent
     * reference are not exported.
     *
     * @return array
     */
    protected function elementToJson(AbstractElement $element)
    {
        return [
            'doc_part' => $element->getDocPart(),
            'doc_part_id' => $element->getDocPartId(),
            'element_id' => $element->getElementId(),
            'element_index' => $element->getElementIndex(),
            'nested_level' => $element->getNestedLevel(),
            'relation_id' => $element->getRelationId(),
            'section_id' => $element->getSectionId(),
        ];
    }

    /**
     * @param  array $elements
     * @return array
     */
    protected function elementsToJson(array $elements)
    {
        $list = [];

        foreach ($elements as $element) {
            $list[] = $this->handleElement($element);
        }

        return $list;
    }

    /**
     * Headers share the footer structure; only the element type differs.
     *
     * @return array
     */
    protected function headerToJson(Header $header)
    {
        return array_merge(
            $this->footerToJson($header),
            [
                'element_type' => 'Header',
            ]
        );
    }

    /**
     * @param  mixed $footer Footer (or header) element.
     * @return array
     */
    protected function footerToJson($footer)
    {
        return array_merge($this->elementToJson($footer), [
            'element_type' => 'Footer',
            'elements' => $this->elementsToJson($footer->getElements()),
            'type' => $footer->getType(),
        ]);
    }

    /**
     * @return array
     */
    protected function bookmarkToJson(Bookmark $element)
    {
        return array_merge($this->elementToJson($element), [
            'element_type' => 'Bookmark',
            'name' => $element->getName(),
        ]);
    }

    /**
     * @return array
     */
    protected function imageToJson(Image $image)
    {
        return array_merge($this->elementToJson($image), [
            'element_type' => 'Image',
            'name' => $image->getName(),
            'style' => $this->imageStyleToJson($image->getStyle()),
            'source' => $image->getSource(),
            'source_type' => $image->getSourceType(),
            'is_watermark' => $image->isWatermark(),
        ]);
    }

    /**
     * @return array
     */
    protected function lineToJson(Line $element)
    {
        return array_merge($this->elementToJson($element), [
            'element_type' => 'Line',
            'style' => $this->lineStyleToJson($element->getStyle()),
        ]);
    }

    /**
     * Only the element type is exported; the link target and text are not.
     *
     * @return array
     */
    protected function linkToJson(Link $element)
    {
        return [
            'element_type' => 'Link',
        ];
    }

    /**
     * @return array
     */
    protected function listItemToJson(ListItem $element)
    {
        return array_merge($this->elementToJson($element), [
            'element_type' => 'ListItem',
            'depth' => $element->getDepth(),
            'style' => $this->listItemStyleToJson($element->getStyle()),
            'text' => $element->getText(),
            'text_object' => $this->textToJson($element->getTextObject()),
        ]);
    }

    /**
     * @return array
     */
    protected function listItemRunToJson(ListItemRun $element)
    {
        return array_merge($this->textRunToJson($element), [
            'element_type' => 'ListItemRun',
            'depth' => $element->getDepth(),
            'style' => $this->listItemStyleToJson($element->getStyle()),
        ]);
    }

    /**
     * Serialize a PreserveText element and add its text to the text index.
     *
     * @return array
     */
    protected function preserveTextToJson(PreserveText $element)
    {
        $fontStyle = $element->getFontStyle();
        $paragraphStyle = $element->getParagraphStyle();

        $text = $element->getText();

        // getText() may hand the text back as a list of parts. Join them all
        // again; keeping only the first part dropped the rest of the text.
        if (is_array($text)) {
            $text = implode('', $text);
        }

        $hash = $this->addText($text);

        return array_merge($this->elementToJson($element), [
            'element_type' => 'PreserveText',
            'font_style' => $fontStyle ? $this->fontStyleToJson($fontStyle) : null,
            'paragraph_style' => $paragraphStyle ? $this->paragraphStyleToJson($paragraphStyle) : null,
            'text' => $text,
            'hash' => $hash,
        ]);
    }

    /**
     * @return array
     */
    protected function pageBreakToJson(PageBreak $element)
    {
        return [
            'element_type' => 'PageBreak',
        ];
    }

    /**
     * Serialize a Text element and add its text to the text index.
     *
     * @return array
     */
    protected function textToJson(Text $element)
    {
        $fontStyle = $element->getFontStyle();
        $paragraphStyle = $element->getParagraphStyle();
        $text = $element->getText();

        $hash = $this->addText($text);

        return array_merge($this->elementToJson($element), [
            'element_type' => 'Text',
            'font_style' => $fontStyle ? $this->fontStyleToJson($fontStyle) : null,
            'paragraph_style' => $paragraphStyle ? $this->paragraphStyleToJson($paragraphStyle) : null,
            'text' => $text,
            'hash' => $hash,
        ]);
    }

    /**
     * @return array
     */
    protected function textBreakToJson(TextBreak $element)
    {
        $fontStyle = $element->getFontStyle();
        $paragraphStyle = $element->getParagraphStyle();

        return array_merge($this->elementToJson($element), [
            'element_type' => 'TextBreak',
            'font_style' => $fontStyle ? $this->fontStyleToJson($fontStyle) : null,
            'paragraph_style' => $paragraphStyle ? $this->paragraphStyleToJson($paragraphStyle) : null,
        ]);
    }

    /**
     * @return array
     */
    protected function textRunToJson(TextRun $element)
    {
        $paragraphStyle = $element->getParagraphStyle();

        return array_merge($this->elementToJson($element), [
            'element_type' => 'TextRun',
            'paragraph_style' => $paragraphStyle ? $this->paragraphStyleToJson($paragraphStyle) : null,
            'elements' => $this->elementsToJson($element->getElements()),
        ]);
    }

    /**
     * @return array
     */
    protected function tableToJson(Table $element)
    {
        return array_merge($this->elementToJson($element), [
            'element_type' => 'Table',
            'style' => $this->tableStyleToJson($element->getStyle()),
            'rows' => array_map(function ($row) {
                return $this->rowToJson($row);
            }, $element->getRows()),
            'width' => $element->getWidth(),
        ]);
    }

    /**
     * Serialize a title. When the title text is an object it is exported as a
     * single text run under `elements` and `text` is left empty; otherwise the
     * plain text is exported and added to the text index.
     *
     * @return array
     */
    protected function titleToJson(Title $element)
    {
        $elements = [];
        $text = $element->getText();

        if (is_object($text)) {
            $elements = [$this->textRunToJson($text)];
            $text = '';
        }

        $result = array_merge($this->elementToJson($element), [
            'element_type' => 'Title',
            'depth' => $element->getDepth(),
            'style' => $element->getStyle(),
            'text' => $text,
            'elements' => $elements,
        ]);

        // Compare with '' explicitly: a plain truthiness check would also skip
        // the perfectly valid title "0".
        if (is_string($text) && $text !== '') {
            $result['hash'] = $this->addText($text);
        }

        return $result;
    }

    /**
     * @param  mixed $row Table row.
     * @return array
     */
    protected function rowToJson($row)
    {
        return [
            'height' => $row->getHeight(),
            'style' => $this->rowStyleToJson($row->getStyle()),
            'cells' => array_map(function ($cell) {
                return $this->cellToJson($cell);
            }, $row->getCells()),
        ];
    }

    /**
     * @param  mixed $cell Table cell.
     * @return array
     */
    protected function cellToJson($cell)
    {
        return [
            'style' => $this->cellStyleToJson($cell->getStyle()),
            'width' => $cell->getWidth(),
            'elements' => $this->elementsToJson($cell->getElements()),
        ];
    }

    // Styles

    /**
     * Border properties shared by cell and table styles.
     *
     * @param  mixed $style Style exposing the border getters.
     * @return array
     */
    protected function borderStyleToJson($style)
    {
        return [
            'style' => 'border',
            'BorderTopSize' => $style->getBorderTopSize(),
            'BorderTopColor' => $style->getBorderTopColor(),
            'BorderTopStyle' => $style->getBorderTopStyle(),
            'BorderLeftSize' => $style->getBorderLeftSize(),
            'BorderLeftColor' => $style->getBorderLeftColor(),
            'BorderLeftStyle' => $style->getBorderLeftStyle(),
            'BorderRightSize' => $style->getBorderRightSize(),
            'BorderRightColor' => $style->getBorderRightColor(),
            'BorderRightStyle' => $style->getBorderRightStyle(),
            'BorderBottomSize' => $style->getBorderBottomSize(),
            'BorderBottomColor' => $style->getBorderBottomColor(),
            'BorderBottomStyle' => $style->getBorderBottomStyle(),
        ];
    }

    /**
     * @return array `VAlign` is only present when the style defines one.
     */
    protected function cellStyleToJson(Style\Cell $style)
    {
        $styles = array_merge($this->borderStyleToJson($style), [
            'style' => 'cell',
            'TextDirection' => $style->getTextDirection(),
            'BgColor' => $style->getBgColor(),
            'GridSpan' => $style->getGridSpan(),
            'VMerge' => $style->getVMerge(),
            'Shading' => $this->shadingStyleToJson($style->getShading()),
            'Width' => $style->getWidth(),
            'Unit' => $style->getUnit(),
        ]);

        if ($vAlign = $style->getVAlign()) {
            $styles['VAlign'] = $vAlign;
        }

        return $styles;
    }

    /**
     * @return array `VAlign` is only present when the style defines one.
     */
    protected function sectionStyleToJson(Style\Section $style)
    {
        $styles = [
            'style' => 'section',
            'BreakType' => $style->getBreakType(),
            'ColsNum' => $style->getColsNum(),
            'ColsSpace' => $style->getColsSpace(),
            'FooterHeight' => $style->getFooterHeight(),
            'Gutter' => $style->getGutter(),
            'HeaderHeight' => $style->getHeaderHeight(),
            'LineNumbering' => $style->getLineNumbering(),
            'MarginBottom' => $style->getMarginBottom(),
            'MarginLeft' => $style->getMarginLeft(),
            'MarginRight' => $style->getMarginRight(),
            'MarginTop' => $style->getMarginTop(),
            'Orientation' => $style->getOrientation(),
            'PageNumberingStart' => $style->getPageNumberingStart(),
            'PageSizeH' => $style->getPageSizeH(),
            'PageSizeW' => $style->getPageSizeW(),
            'PaperSize' => $style->getPaperSize(),
        ];

        $vAlign = $style->getVAlign();

        if ($vAlign) {
            $styles['VAlign'] = $vAlign;
        }

        return $styles;
    }

    /**
     * @param  mixed $style Shading style or an empty value.
     * @return array|null Null when no shading is set.
     */
    protected function shadingStyleToJson($style)
    {
        if (! $style) {
            return null;
        }

        return [
            'style' => 'shading',
            'pattern' => $style->getPattern(),
            'color' => $style->getColor(),
            'fill' => $style->getFill(),
        ];
    }

    /**
     * Line styles carry every image style property plus the line specific ones.
     *
     * @return array
     */
    protected function lineStyleToJson(Style\Line $style)
    {
        return array_merge($this->imageStyleToJson($style), [
            'style' => 'line',
            'BeginArrow' => $style->getBeginArrow(),
            'Color' => $style->getColor(),
            'ConnectorType' => $style->getConnectorType(),
            'Dash' => $style->getDash(),
            'EndArrow' => $style->getEndArrow(),
            'Flip' => $style->isFlip(),
            'Weight' => $style->getWeight(),
        ]);
    }

    /**
     * @return array
     */
    protected function listItemStyleToJson(Style\ListItem $style)
    {
        return [
            'style' => 'line_item',
            'ListType' => $style->getListType(),
            'NumStyle' => $style->getNumStyle(),
        ];
    }

    /**
     * @return array `Paragraph` is only present when the font style has one.
     */
    protected function fontStyleToJson(Style\Font $style)
    {
        $styles = [
            'style' => 'font',
            'StyleName' => $style->getStyleName(),
            'Name' => $style->getName(),
            'Size' => $style->getSize(),
            'Color' => $style->getColor(),
            'Hint' => $style->getHint(),
            'Bold' => $style->isBold(),
            'Italic' => $style->isItalic(),
            'Underline' => $style->getUnderline(),
            'Strikethrough' => $style->isStrikethrough(),
            'DoubleStrikethrough' => $style->isDoubleStrikethrough(),
            'SuperScript' => $style->isSuperScript(),
            'SubScript' => $style->isSubScript(),
            'SmallCaps' => $style->isSmallCaps(),
            'AllCaps' => $style->isAllCaps(),
            'FgColor' => $style->getFgColor(),
            'Hidden' => $style->isHidden(),
            'Type' => $style->getStyleType(),
            'Scale' => $style->getScale(),
            'Spacing' => $style->getSpacing(),
            'Kerning' => $style->getKerning(),
            'Position' => $style->getPosition(),
        ];

        if ($style->getParagraph()) {
            $styles['Paragraph'] = $this->paragraphStyleToJson($style->getParagraph());
        }

        return $styles;
    }

    /**
     * @return array
     */
    protected function frameStyleToJson(Style\Frame $style)
    {
        return [
            'style' => 'frame',
            'Alignment' => $style->getAlignment(),
            'Height' => $style->getHeight(),
            'Left' => $style->getLeft(),
            'HPos' => $style->getHPos(),
            'HPosRelTo' => $style->getHPosRelTo(),
            'Pos' => $style->getPos(),
            'VPos' => $style->getVPos(),
            'VPosRelTo' => $style->getVPosRelTo(),
            'Position' => $style->getPosition(),
            'Top' => $style->getTop(),
            'Unit' => $style->getUnit(),
            'Width' => $style->getWidth(),
            'Wrap' => $style->getWrap(),
            'WrapDistanceBottom' => $style->getWrapDistanceBottom(),
            'WrapDistanceLeft' => $style->getWrapDistanceLeft(),
            'WrapDistanceRight' => $style->getWrapDistanceRight(),
            'WrapDistanceTop' => $style->getWrapDistanceTop(),
        ];
    }

    /**
     * Image styles carry every frame style property plus the image specific ones.
     *
     * @return array
     */
    protected function imageStyleToJson(Style\Image $style)
    {
        return array_merge($this->frameStyleToJson($style), [
            'style' => 'image',
            'MarginLeft' => $style->getMarginLeft(),
            'MarginTop' => $style->getMarginTop(),
            'WrappingStyle' => $style->getWrappingStyle(),
            'Positioning' => $style->getPositioning(),
            'PosHorizontal' => $style->getPosHorizontal(),
            'PosHorizontalRel' => $style->getPosHorizontalRel(),
            'PosVertical' => $style->getPosVertical(),
            'PosVerticalRel' => $style->getPosVerticalRel(),
        ]);
    }

    /**
     * @param  mixed $style Indentation style or an empty value.
     * @return array|null Null when no indentation is set.
     */
    protected function indentationStyleToJson($style)
    {
        if (! $style) {
            return null;
        }

        return [
            'style' => 'indentation',
            'Left' => $style->getLeft(),
            'Right' => $style->getRight(),
            'FirstLine' => $style->getFirstLine(),
            'Hanging' => $style->getHanging(),
        ];
    }

    /**
     * @return array
     */
    protected function spacingStyleToJson(Style\Spacing $style)
    {
        return [
            'style' => 'spacing',
            'Before' => $style->getBefore(),
            'After' => $style->getAfter(),
            'Line' => $style->getLine(),
            'LineRule' => $style->getLineRule(),
        ];
    }

    /**
     * @return array
     */
    protected function numberingStyleToJson(Style\Numbering $style)
    {
        return [
            'style' => 'numbering',
            'NumId' => $style->getNumId(),
            'Type' => $style->getType(),
            'StyleName' => $style->getStyleName(),
            'Index' => $style->getIndex(),
            'Levels' => array_map(function ($numberingLevel) {
                return $this->numberingLevelStyleToJson($numberingLevel);
            }, $style->getLevels()),
        ];
    }

    /**
     * Note that the discriminator key is `type` here, unlike the `style` key
     * used by the other style serializers.
     *
     * @return array
     */
    protected function numberingLevelStyleToJson(Style\NumberingLevel $style)
    {
        return [
            'type' => 'numbering_level',
            'Level' => $style->getLevel(),
            'Start' => $style->getStart(),
            'Format' => $style->getFormat(),
            'Restart' => $style->getRestart(),
            'PStyle' => $style->getPStyle(),
            'Suffix' => $style->getSuffix(),
            'Text' => $style->getText(),
            'Alignment' => $style->getAlignment(),
            'Left' => $style->getLeft(),
            'Hanging' => $style->getHanging(),
            'TabPos' => $style->getTabPos(),
            'Font' => $style->getFont(),
            'Hint' => $style->getHint(),
        ];
    }

    /**
     * Serialize a paragraph style. Empty `Alignment` and `TextAlignment`
     * values are exported as "baseline".
     *
     * @return array
     */
    protected function paragraphStyleToJson(Style\Paragraph $style)
    {
        $styles = [
            'Name' => $style->getStyleName(),
            'BasedOn' => $style->getBasedOn(),
            'Next' => $style->getNext(),
            'Alignment' => $style->getAlignment(),
            'Indentation' => $style->getIndentation(),
            'Spacing' => $style->getSpacing(),
            'WidowControl' => $style->hasWidowControl(),
            'KeepNext' => $style->isKeepNext(),
            'KeepLines' => $style->isKeepLines(),
            'PageBreakBefore' => $style->hasPageBreakBefore(),
            'NumStyle' => $style->getNumStyle(),
            'NumLevel' => $style->getNumLevel(),
            // NOTE(review): exported as returned by PHPWord; if these are
            // objects without public properties they will not survive
            // json_encode() - confirm.
            'Tabs' => $style->getTabs(),
            // Exported the same way as for cells, so the pattern, color and
            // fill survive JSON encoding instead of a raw object being emitted.
            'Shading' => $this->shadingStyleToJson($style->getShading()),
            'ContextualSpacing' => $style->hasContextualSpacing(),
            'Bidi' => $style->isBidi(),
            'TextAlignment' => $style->getTextAlignment(),
            'SuppressAutoHyphens' => $style->hasSuppressAutoHyphens(),
        ];

        $styles['style'] = 'paragraph';

        if (! $styles['Alignment']) {
            $styles['Alignment'] = 'baseline';
        }

        if (! $styles['TextAlignment']) {
            $styles['TextAlignment'] = 'baseline';
        }

        if ($styles['Indentation']) {
            $styles['Indentation'] = $this->indentationStyleToJson($styles['Indentation']);
        }

        // Only expand real Spacing objects: spacingStyleToJson() is type-hinted
        // and would raise a TypeError for anything else.
        // NOTE(review): upstream PHPWord appears to return the numeric line
        // spacing from getSpacing() (the Spacing object comes from getSpace()) -
        // confirm against the installed version. A numeric value is kept as is.
        if ($styles['Spacing'] instanceof Style\Spacing) {
            $styles['Spacing'] = $this->spacingStyleToJson($styles['Spacing']);
        }

        return $styles;
    }

    /**
     * @param  mixed $style Table style object, style name, or an empty value.
     * @return array|string Empty array when no style is set; the name itself
     *                      when the style is referenced by name.
     */
    protected function tableStyleToJson($style)
    {
        if (! $style) {
            return [];
        }

        if (is_string($style)) {
            return $style;
        }

        return array_merge(
            $this->borderStyleToJson($style),
            [
                'style' => 'table',
                'BgColor' => $style->getBgColor(),
                'CellSpacing' => $style->getCellSpacing(),
                // Same representation as in cellStyleToJson().
                'Shading' => $this->shadingStyleToJson($style->getShading()),
                'Alignment' => $style->getAlignment(),
                'Width' => $style->getWidth(),
                'Unit' => $style->getUnit(),
                'Layout' => $style->getLayout(),
                'ColumnWidths' => $style->getColumnWidths(),
                'BidiVisual' => $style->isBidiVisual(),
                'position' => $this->tablePositionStyleToJson($style->getPosition()),
                'first_row' => $this->tableStyleToJson($style->getFirstRow()),
                'BorderInsideHSize' => $style->getBorderInsideHSize(),
                'BorderInsideHColor' => $style->getBorderInsideHColor(),
                'BorderInsideVSize' => $style->getBorderInsideVSize(),
                'BorderInsideVColor' => $style->getBorderInsideVColor(),
                'CellMarginTop' => $style->getCellMarginTop(),
                'CellMarginRight' => $style->getCellMarginRight(),
                'CellMarginLeft' => $style->getCellMarginLeft(),
                'CellMarginBottom' => $style->getCellMarginBottom(),
            ]
        );
    }

    /**
     * @param  mixed $style Table position style or an empty value.
     * @return array Empty array when no position is set.
     */
    protected function tablePositionStyleToJson($style)
    {
        if (! $style) {
            return [];
        }

        return [
            'style' => 'table_position',
            'LeftFromText' => $style->getLeftFromText(),
            'RightFromText' => $style->getRightFromText(),
            'TopFromText' => $style->getTopFromText(),
            'BottomFromText' => $style->getBottomFromText(),
            'VertAnchor' => $style->getVertAnchor(),
            'HorzAnchor' => $style->getHorzAnchor(),
            'TblpXSpec' => $style->getTblpXSpec(),
            'TblpX' => $style->getTblpX(),
            'TblpYSpec' => $style->getTblpYSpec(),
            'TblpY' => $style->getTblpY(),
        ];
    }

    /**
     * @param  mixed $style Row style.
     * @return array
     */
    protected function rowStyleToJson($style)
    {
        return [
            'style' => 'row',
            'TblHeader' => $style->isTblHeader(),
            'CantSplit' => $style->isCantSplit(),
            'ExactHeight' => $style->isExactHeight(),
        ];
    }

    /**
     * Append a text fragment to the collected document text and record where
     * it sits in that text.
     *
     * `range_start` and `range_end` are inclusive offsets into
     * `$this->textContents['text']`. An empty fragment is recorded with
     * `range_end` equal to `range_start` and does not move the cursor.
     *
     * NOTE(review): offsets are measured with strlen(), i.e. in bytes. If the
     * consumer of these ranges counts characters, multi-byte text will not
     * line up - confirm.
     *
     * @param  string|null $text
     * @return string Hash identifying the fragment.
     */
    protected function addText($text)
    {
        // Text may be null; normalise it once so strlen() and the
        // concatenation below agree.
        $text = (string) $text;
        $length = strlen($text);

        $hash = $this->generateHash();

        $this->textContents['text'] .= $text;
        $this->textContents['elements'][] = [
            'hash' => $hash,
            'range_start' => $this->textLength,
            'range_end' => $this->textLength + ($length > 0 ? $length - 1 : 0),
        ];

        // Advance by exactly what was appended. Counting one position for an
        // empty fragment shifted every later range away from the real text.
        $this->textLength += $length;

        return $hash;
    }

    /**
     * @return string Identifier produced by uniqid().
     */
    protected function generateHash()
    {
        return uniqid();
    }

    /**
     * Class name of an object without its namespace.
     *
     * @param  object $object
     * @return string
     */
    private function shortClassName($object)
    {
        $parts = explode('\\', get_class($object));

        return end($parts);
    }
}

291
app/Ingest/DocxWriter.php

@ -0,0 +1,291 @@
<?php
namespace App\Ingest;
use PhpOffice\PhpWord\Element\TextRun;
use PhpOffice\PhpWord\IOFactory;
use PhpOffice\PhpWord\PhpWord;
use PhpOffice\PhpWord\Style;
use PhpOffice\PhpWord\Style\Font;
/**
 * Rebuilds a DOCX file with PHPWord from the array structure produced by
 * DocxReader (styles, sections and nested elements).
 */
class DocxWriter
{
    /**
     * Storage disk the document is written to.
     *
     * @var mixed
     */
    protected $storage;

    /**
     * Path, on the storage disk, of the file to create.
     *
     * @var string
     */
    protected $saveAtPath;

    /**
     * Document being built.
     *
     * @var PhpWord
     */
    protected $handler;

    /**
     * @param mixed  $storage
     * @param string $saveAtPath
     */
    public function __construct($storage, $saveAtPath)
    {
        $this->storage = $storage;
        $this->saveAtPath = $saveAtPath;
        $this->handler = new PhpWord();
    }

    /**
     * Build the document from $data and save it as a Word2007 file.
     *
     * @param array $data Keys used: default_font_name, default_font_size, styles, elements.
     */
    public function execute(array $data)
    {
        $this->handler->setDefaultFontName($data['default_font_name']);
        $this->handler->setDefaultFontSize($data['default_font_size']);

        $this->setStyles($data['styles']);

        foreach ($data['elements'] as $section) {
            $this->handleSection($section);
        }

        $objWriter = IOFactory::createWriter($this->handler, 'Word2007');
        $objWriter->save($this->storage->path($this->saveAtPath));
    }

    /**
     * Add a section with its headers, footers and elements.
     *
     * @param array $element
     */
    protected function handleSection($element)
    {
        $section = $this->handler->addSection($element['style']);

        foreach ($element['headers'] as $header) {
            $this->handleHeader($header, $section);
        }

        foreach ($element['footers'] as $footer) {
            $this->handleFooter($footer, $section);
        }

        $this->addElementsToElement($section, $element['elements']);
    }

    /**
     * @param  array $header
     * @param  mixed $section
     * @return mixed The created header.
     */
    protected function handleHeader($header, $section)
    {
        $headerElement = $section->addHeader($header['type']);

        $this->addElementsToElement($headerElement, $header['elements']);

        return $headerElement;
    }

    /**
     * @param  array $footer
     * @param  mixed $section
     * @return mixed The created footer.
     */
    protected function handleFooter($footer, $section)
    {
        // Footers were previously created with addHeader() and therefore ended
        // up at the top of the page.
        $footerElement = $section->addFooter($footer['type']);

        $this->addElementsToElement($footerElement, $footer['elements']);

        return $footerElement;
    }

    /**
     * Add each element to the parent through the `handle<element_type>` method.
     *
     * @param mixed $parentElement
     * @param array $elements
     *
     * @throws \InvalidArgumentException When an element has no, or an unknown, element type.
     */
    protected function addElementsToElement($parentElement, $elements)
    {
        foreach ($elements as $element) {
            $type = isset($element['element_type']) && is_string($element['element_type'])
                ? $element['element_type']
                : '';
            $method = 'handle' . $type;

            // The element data is decoded from a request payload, so report a
            // bad type clearly instead of failing on an undefined method call.
            if ($type === '' || ! method_exists($this, $method)) {
                throw new \InvalidArgumentException('Unsupported element type: ' . $type);
            }

            $this->$method($parentElement, $element);
        }
    }

    /**
     * Images are currently not written back: nothing is added to the parent.
     *
     * NOTE(review): the reader exports `source`, `style`, `is_watermark` and
     * `name` for images, which is what addImage() would need - confirm the
     * source is still readable at this point before enabling it.
     *
     * @param mixed $parentElement
     * @param array $element
     */
    protected function handleImage($parentElement, array $element)
    {
    }

    /**
     * @param mixed $parentElement
     * @param array $element
     */
    protected function handleBookmark($parentElement, array $element)
    {
        $parentElement->addBookmark($element['name']);
    }

    /**
     * @param mixed $parentElement
     * @param array $element
     */
    protected function handleLine($parentElement, array $element)
    {
        $parentElement->addLine($element['style']);
    }

    /**
     * Links are currently not written back: nothing is added to the parent.
     *
     * @param mixed $parentElement
     * @param array $element
     */
    protected function handleLink($parentElement, array $element)
    {
    }

    /**
     * Text and text styles are taken from `text_object` when present,
     * otherwise from the element itself.
     *
     * @param mixed $parentElement
     * @param array $element
     */
    protected function handleListItem($parentElement, array $element)
    {
        $data = array_key_exists('text_object', $element) ? $element['text_object'] : $element;

        $parentElement->addListItem(
            $data['text'],
            $element['depth'],
            $data['font_style'],
            $element['style'],
            $data['paragraph_style']
        );
    }

    /**
     * @param mixed $parentElement
     * @param array $element
     */
    protected function handleListItemRun($parentElement, array $element)
    {
        $createdElement = $parentElement->addListItemRun(
            $element['depth'],
            $element['style'],
            $element['paragraph_style']
        );

        if ($createdElement) {
            $this->addElementsToElement($createdElement, $element['elements']);
        }
    }

    /**
     * @param mixed $parentElement
     * @param array $element
     */
    protected function handlePageBreak($parentElement, array $element)
    {
        $parentElement->addPageBreak();
    }

    /**
     * @param mixed $parentElement
     * @param array $element
     */
    protected function handlePreserveText($parentElement, array $element)
    {
        $parentElement->addPreserveText(
            $element['text'],
            $element['font_style'],
            $element['paragraph_style']
        );
    }

    /**
     * @param mixed $parentElement
     * @param array $element
     */
    protected function handleText($parentElement, array $element)
    {
        // @TODO Improve bold, italic, list items styles, other styles..
        $parentElement->addText(
            $element['text'],
            $element['font_style'],
            $element['paragraph_style']
        );
    }

    /**
     * Add a single text break, rebuilding its font style object when one was exported.
     *
     * @param mixed $parentElement
     * @param array $element
     */
    protected function handleTextBreak($parentElement, array $element)
    {
        $fontStyle = null;

        if ($fs = $element['font_style']) {
            $paragraphStyle = array_key_exists('Paragraph', $fs) ? $fs['Paragraph'] : null;

            // The first constructor argument of Font is the style type, not the
            // style name; the name is set separately below.
            $fontStyle = new Font(isset($fs['Type']) ? $fs['Type'] : 'text', $paragraphStyle);
            $fontStyle->setStyleName($fs['StyleName']);

            // Basic
            $fontStyle->setName($fs['Name']);
            $fontStyle->setSize($fs['Size']);
            $fontStyle->setColor($fs['Color']);
            $fontStyle->setHint($fs['Hint']);

            // Style
            $fontStyle->setBold($fs['Bold']);
            $fontStyle->setItalic($fs['Italic']);
            $fontStyle->setUnderline($fs['Underline']);
            $fontStyle->setStrikethrough($fs['Strikethrough']);
            $fontStyle->setDoubleStrikethrough($fs['DoubleStrikethrough']);
            $fontStyle->setSuperScript($fs['SuperScript']);
            $fontStyle->setSubScript($fs['SubScript']);
            $fontStyle->setSmallCaps($fs['SmallCaps']);
            $fontStyle->setAllCaps($fs['AllCaps']);
            $fontStyle->setFgColor($fs['FgColor']);
            $fontStyle->setHidden($fs['Hidden']);

            // Spacing
            $fontStyle->setScale($fs['Scale']);
            $fontStyle->setSpacing($fs['Spacing']);
            $fontStyle->setKerning($fs['Kerning']);
            $fontStyle->setPosition($fs['Position']);
        }

        $parentElement->addTextBreak(
            1,
            $fontStyle,
            $element['paragraph_style']
        );
    }

    /**
     * Add a text run and its child elements. The exported paragraph style is
     * not applied to the run.
     *
     * @param mixed $parentElement
     * @param array $element
     */
    protected function handleTextRun($parentElement, array $element)
    {
        $createdElement = $parentElement->addTextRun();

        if ($createdElement) {
            $this->addElementsToElement($createdElement, $element['elements']);
        }
    }

    /**
     * @param mixed $parentElement
     * @param array $element
     */
    protected function handleTable($parentElement, array $element)
    {
        $table = $parentElement->addTable($element['style']);
        $table->setWidth($element['width']);

        foreach ($element['rows'] as $row) {
            $addedRow = $table->addRow($row['height'], $row['style']);

            foreach ($row['cells'] as $cell) {
                $addedCell = $addedRow->addCell($cell['width'], $cell['style']);

                if (count($cell['elements']) > 0) {
                    $this->addElementsToElement($addedCell, $cell['elements']);
                }
            }
        }
    }

    /**
     * Add a title, either from plain text or from an exported text run.
     *
     * @param  mixed $parentElement
     * @param  array $element
     * @return mixed The created title.
     */
    protected function handleTitle($parentElement, array $element)
    {
        $text = $element['text'];
        $run = null;

        if (is_array($text)) {
            // Text run stored directly under `text`.
            $run = $text;
        } elseif (! empty($element['elements'])) {
            // DocxReader stores the text run of a title as the first entry of
            // `elements` and leaves `text` empty; without this branch such
            // titles were written without any text.
            $run = $element['elements'][0];
        }

        if ($run !== null) {
            $textRun = new TextRun(isset($run['paragraph_style']) ? $run['paragraph_style'] : null);
            $this->addElementsToElement($textRun, $run['elements']);
            $text = $textRun;
        }

        return $parentElement->addTitle($text, $element['depth']);
    }

    /**
     * Register the exported named styles with PHPWord.
     *
     * `Title` and `Heading_<depth>` entries become title styles; other entries
     * are registered according to their `style` key. Link, title and table
     * entries are not registered.
     *
     * @param array $styles Style definitions keyed by style name.
     */
    protected function setStyles($styles)
    {
        // Styles are kept in a static registry on PhpOffice\PhpWord\Style that
        // is shared by the whole PHP process. Start from a clean registry so
        // styles registered for a previously written document do not interfere
        // with this one.
        Style::resetStyles();

        foreach ($styles as $name => $style) {
            $name = (string) $name;
            // Font styles only carry a `Paragraph` key when one was exported.
            $paragraphStyle = isset($style['Paragraph']) ? $style['Paragraph'] : null;

            if ($name === 'Title') {
                Style::addTitleStyle(null, $style, $paragraphStyle);

                continue;
            }

            // Only names that start with the prefix are heading styles.
            if (strpos($name, 'Heading_') === 0) {
                $depth = (int) str_replace('Heading_', '', $name);

                Style::addTitleStyle($depth, $style, $paragraphStyle);

                continue;
            }

            $kind = isset($style['style']) ? $style['style'] : null;

            if ($kind === 'font') {
                Style::addFontStyle($name, $style, $paragraphStyle);
            } elseif ($kind === 'paragraph') {
                Style::addParagraphStyle($name, $style);
            } elseif ($kind === 'numbering') {
                Style::addNumberingStyle($name, $style);
            }
        }
    }
}

2
app/Ingest/OCR.php

@ -28,7 +28,7 @@ class OCR
protected function preProcess()
{
$this->applyDewarp();
$this->applyDeskew();
// $this->applyDeskew();
}
protected function applyDewarp()

8
app/Ingest/PDFConvertor.php

@ -88,6 +88,7 @@ class PDFConvertor extends AbstractConvertor
$imagesInFooter = true;
$mdContents = '';
$htmlContents = '';
try {
foreach ($orderedList as $page) {
@ -106,12 +107,14 @@ class PDFConvertor extends AbstractConvertor
if ($textContents) {
if ($html) {
$mdContents = $mdContents . $this->convertHtmlToMD($html) . "\n";
$mdContents = $mdContents . $this->convertHtmlToMD($html) . "\n\n";
$htmlContents = $htmlContents . $html;
$html = '';
}
$mdContents = $mdContents . $textContents . "\n";
$mdContents = $mdContents . $textContents . "\n\n";
$htmlContents = $htmlContents . "<div>$textContents</div>";
$this->storage->delete($imageFilePath);
@ -151,6 +154,7 @@ class PDFConvertor extends AbstractConvertor
}
$mdContents = $mdContents . $this->convertHtmlToMD($html) . "\n\n";
$htmlContents = $htmlContents . "<html><head></head><body>$html</body></html>";
}
} catch (\Exception $exception) {
$this->storage->deleteDirectory($this->directoryPath);

79
app/Jobs/IngestDocuments.php

@ -3,6 +3,8 @@
namespace App\Jobs;
use App\Ingest\Convertor;
use App\Ingest\DataJsonConvertor;
use App\Ingest\DocxReader;
use App\Parser\ParseXml;
use App\Parser\DocxParser\ParseDocx;
use App\Parser\HtmlParser\ParseHtml;
@ -21,7 +23,8 @@ class IngestDocuments implements ShouldQueue
use Dispatchable, InteractsWithQueue, Queueable;
protected $id;
private $path;
protected $fileResultType;
protected $path;
protected $type;
protected $fromRequest;
@ -30,46 +33,24 @@ class IngestDocuments implements ShouldQueue
*/
private $storage;
/**
* @var \App\Parser\DocxParser\ParseDocx
*/
private $parserDocx;
/**
* @var \App\Parser\ParseXml
*/
private $parserXml;
/**
* @var \App\Parser\HtmlParser\ParseHtml
*/
private $parserHtml;
/**
* @var \App\Parser\ParseHtmlArray
*/
private $parseHtmlArray;
/**
* Create a new job instance.
*
* @param $id
* @param $fileResultType
* @param string $path
* @param $type
* @param $fromRequest
*/
public function __construct($id, string $path, $type, $fromRequest)
public function __construct($id, $fileResultType, string $path, $type, $fromRequest)
{
$this->id = $id;
$this->fileResultType = $fileResultType;
$this->path = $path;
$this->type = $type;
$this->fromRequest = $fromRequest;
$this->storage = Storage::disk('local');
$this->parserDocx = new ParseDocx();
$this->parserXml = new ParseXml();
$this->parserHtml = new ParseHtml();
$this->parseHtmlArray = new ParseHtmlArray();
}
/**
@ -79,12 +60,13 @@ class IngestDocuments implements ShouldQueue
*/
public function handle()
{
$convertor = new Convertor($this->path, $this->type);
try {
$convertor->execute();
$this->execute();
} catch (\Exception $exception) {
\Illuminate\Support\Facades\Log::info('=============== IngestDocuments@handle');
\Illuminate\Support\Facades\Log::info($exception->getMessage());
\Illuminate\Support\Facades\Log::info($exception->getTraceAsString());
\Illuminate\Support\Facades\Log::info('=============== ');
$this->failed();
@ -94,7 +76,7 @@ class IngestDocuments implements ShouldQueue
$directoryPath = pathinfo($this->path, PATHINFO_DIRNAME);
if ($this->fromRequest) {
SendToCore::dispatch($this->id, $directoryPath);
SendToCore::dispatch($this->id, $this->fileResultType, $directoryPath);
return;
}
@ -104,6 +86,41 @@ class IngestDocuments implements ShouldQueue
$this->updateAnalyzer();
}
/**
 * Run the conversion that matches the requested result type.
 *
 * 'md' produces the Markdown output; any other value produces the JSON data file.
 *
 * @throws \Exception
 */
protected function execute()
{
    if ($this->fileResultType !== 'md') {
        $this->convertToJsonData();

        return;
    }

    $this->convertToMD();
}
/**
* Convert document to plain MD file which is easy to work with.
*
* @throws \Exception
*/
protected function convertToMD()
{
    // Hand the stored file and its detected type to the Markdown convertor and run it.
    (new Convertor($this->path, $this->type))->execute();
}
/**
* Convert document to JSON data file.
*
* @throws \Exception
*/
protected function convertToJsonData()
{
    // Hand the stored file and its detected type to the JSON data convertor and run it.
    (new DataJsonConvertor($this->path, $this->type))->execute();
}
public function failed()
{
if ( ! $this->storage) {
@ -115,7 +132,7 @@ class IngestDocuments implements ShouldQueue
$directoryPath = pathinfo($this->path, PATHINFO_DIRNAME);
if ($this->fromRequest) {
SendToCore::dispatch($this->id, $directoryPath, true);
SendToCore::dispatch($this->id, $this->fileResultType, $directoryPath, true);
return;
}

131
app/Jobs/RecreateDocument.php

@ -0,0 +1,131 @@
<?php
namespace App\Jobs;
use App\Ingest\DocxWriter;
use Illuminate\Bus\Queueable;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Bus\Dispatchable;
use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Queue\SerializesModels;
use Illuminate\Support\Facades\Log;
use Illuminate\Support\Facades\Storage;
use Spatie\WebhookServer\WebhookCall;
/**
 * Queued job that rebuilds a DOCX document from previously exported JSON data
 * and reports the outcome to the core application through a webhook.
 */
class RecreateDocument implements ShouldQueue
{
    use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;

    /** @var mixed Identifier of the document; used in the output file name and the webhook payload. */
    protected $id;

    /** @var array Document data: 'contents' (with 'text' and 'elements') and the 'elements' tree. */
    protected $data;

    /** @var mixed The 'local' storage disk the document is written to. */
    protected $storage;

    /** @var string Webhook endpoint of the core application. */
    protected $url;

    /** @var string|null Secret used to sign the webhook call. */
    protected $secret;

    /**
     * Create a new job instance.
     *
     * @param mixed $id   Identifier of the document to recreate.
     * @param array $data Decoded JSON data describing the document.
     * @return void
     */
    public function __construct($id, $data)
    {
        $this->id = $id;
        $this->data = $data;
        $this->storage = Storage::disk('local');
        $this->url = env('WEBHOOK_CORE_URL') . '/webhooks';
        $this->secret = env('WEBHOOK_CORE_SECRET');
    }

    /**
     * Execute the job.
     *
     * Any exception raised while preparing the data or writing the document is
     * logged and reported to the core as a 'fail' status instead of being rethrown.
     *
     * @return void
     */
    public function handle()
    {
        try {
            $this->setupData();
            $this->createDocx();
            // Convert to original format, either PDF, ODT, etc.
            $this->sendResponse('success');
        } catch (\Exception $exception) {
            Log::info('RecreateDocument@handle: ' . $exception->getMessage());
            Log::info($exception->getTraceAsString());
            $this->sendResponse('fail');
        }
    }

    /**
     * Slice the flat text into per-element strings and write them back into the element tree.
     *
     * Each entry of contents.elements maps a hash to an inclusive
     * [range_start, range_end] window of contents.text.
     *
     * @return void
     */
    protected function setupData()
    {
        $text = $this->data['contents']['text'];
        $textMapper = [];

        foreach ($this->data['contents']['elements'] as $element) {
            // NOTE(review): substr() works on bytes; if the ranges are character offsets
            // of multi-byte (UTF-8) text this needs mb_substr() instead. Confirm with the producer.
            $textMapper[$element['hash']] = substr(
                $text,
                $element['range_start'],
                $element['range_end'] - $element['range_start'] + 1 // range_end is inclusive
            );
        }

        $this->data['elements'] = $this->updateText($this->data['elements'], $textMapper);
    }

    /**
     * Recursively replace the text of every element with the text mapped to its hash.
     *
     * A hash that is missing from $textMapper is not guarded here.
     *
     * @param array $elements   Element tree (children live under 'elements').
     * @param array $textMapper Map of hash => replacement text.
     * @return array The element tree with updated texts.
     */
    protected function updateText($elements, $textMapper)
    {
        foreach ($elements as $index => $element) {
            if (array_key_exists('hash', $element)) {
                $elements[$index]['text'] = $textMapper[$element['hash']];
            }

            // Only rewrite a text object that carries both the text to replace and the
            // hash used for the lookup; the hash was previously read without being checked.
            if (
                array_key_exists('text_object', $element) &&
                is_array($element['text_object']) &&
                array_key_exists('text', $element['text_object']) &&
                array_key_exists('hash', $element['text_object'])
            ) {
                $elements[$index]['text_object']['text'] = $textMapper[$element['text_object']['hash']];
            }

            if (isset($elements[$index]['elements'])) {
                $elements[$index]['elements'] = $this->updateText($elements[$index]['elements'], $textMapper);
            }
        }

        return $elements;
    }

    /**
     * Write the prepared data to "contracts/<id>-document.docx" through the DocxWriter.
     *
     * @return void
     */
    protected function createDocx()
    {
        $path = 'contracts/' . $this->id . '-document.docx';

        $writer = new DocxWriter($this->storage, $path);
        $writer->execute($this->data);
    }

    /**
     * Notify the core application about the outcome through a signed webhook call.
     *
     * @param string $status Status reported in the payload ('success' or 'fail').
     * @return bool True when the webhook call was dispatched, false when dispatching threw.
     */
    protected function sendResponse($status)
    {
        try {
            WebhookCall::create()
                ->url($this->url)
                ->payload(['data' => [
                    'id' => $this->id,
                    'content' => '',
                    'file_result_type' => 'document-recreated',
                    'status' => $status,
                ]])
                ->useSecret($this->secret)
                ->dispatch();

            return true;
        } catch (\Exception $exception) {
            Log::error('RecreateDocument@sendResponse: ' . $exception->getMessage());

            return false;
        }
    }
}

53
app/Jobs/SendToCore.php

@ -14,14 +14,11 @@ class SendToCore implements ShouldQueue
{
use Dispatchable, InteractsWithQueue, Queueable;
private $url;
private $secret;
private $directoryPath;
private $id;
protected $url;
protected $secret;
protected $directoryPath;
protected $fileResultType;
protected $id;
protected $hasFailed;
/**
@ -33,16 +30,18 @@ class SendToCore implements ShouldQueue
* Create a new job instance.
*
* @param $id
* @param string $fileResultType
* @param null $directoryPath
* @param bool $hasFailed
*/
public function __construct($id, $directoryPath = null, $hasFailed = false)
public function __construct($id, $fileResultType, $directoryPath = null, $hasFailed = false)
{
$this->url = env('WEBHOOK_CORE_URL') . '/webhooks';
$this->secret = env('WEBHOOK_CORE_SECRET');
$this->id = $id;
$this->directoryPath = $directoryPath;
$this->fileResultType = $fileResultType;
$this->hasFailed = $hasFailed;
}
@ -94,7 +93,7 @@ class SendToCore implements ShouldQueue
/**
* Send the data to the core through webhooks
*
* @param $content
* @param array $content
* @return bool
*/
protected function sendTheData(array $content)
@ -105,6 +104,7 @@ class SendToCore implements ShouldQueue
->payload(['data' => [
'id' => $this->id,
'content' => $content,
'file_result_type' => $this->fileResultType,
'status' => count($content) > 0 ? 'success' : 'fail',
]])
->useSecret($this->secret)
@ -120,24 +120,29 @@ class SendToCore implements ShouldQueue
protected function getContent()
{
$document = $this->storage->get("$this->directoryPath/document.md");
$extension = $this->fileResultType === 'md' ? 'md' : 'json';
$filePath = "$this->directoryPath/document.$extension";
$document = $this->storage->get($filePath);
$document = $this->encodeContent($document);
$images = [];
$allFiles = $this->storage->allFiles($this->directoryPath);
foreach ($allFiles as $file) {
// @TODO We are using this check in the 'PDFConvertor' file, refactor and improve.
if (in_array(pathinfo($file, PATHINFO_EXTENSION), ['jpg', 'png'])) {
$name = pathinfo($file, PATHINFO_FILENAME);
$type = pathinfo($file, PATHINFO_EXTENSION);
$images[] = [
'name' => $name,
'type' => $type,
'contents' => 'data:image/' . $type . ';base64,' . base64_encode($this->storage->get($file)),
];
if ($extension === 'md') {
$allFiles = $this->storage->allFiles($this->directoryPath);
foreach ($allFiles as $file) {
// @TODO We are using this check in the 'PDFConvertor' file, refactor and improve.
if (in_array(pathinfo($file, PATHINFO_EXTENSION), ['jpg', 'png'])) {
$name = pathinfo($file, PATHINFO_FILENAME);
$type = pathinfo($file, PATHINFO_EXTENSION);
$images[] = [
'name' => $name,
'type' => $type,
'contents' => 'data:image/' . $type . ';base64,' . base64_encode($this->storage->get($file)),
];
}
}
}

3
composer.json

@ -18,7 +18,8 @@
"predis/predis": "^1.1",
"spatie/laravel-webhook-server": "^1.13",
"spatie/pdf-to-text": "^1.3",
"thiagoalessio/tesseract_ocr": "^2.11"
"thiagoalessio/tesseract_ocr": "^2.11",
"ext-json": "*"
},
"require-dev": {
"facade/ignition": "^1.4",

2
routes/web.php

@ -1,6 +1,8 @@
<?php
Route::post('/ingest', 'IngestController@store');
Route::post('/recreate-document', 'RecreateDocumentController@store');
Route::get('/recreate-document/{id}', 'RecreateDocumentController@show');
Route::get('/', function () {
return 1;

30
tests/Feature/IngestDocxTest.php

@ -1,30 +0,0 @@
<?php
namespace Tests\Feature;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Foundation\Testing\WithFaker;
use Illuminate\Support\Facades\Storage;
use Tests\TestCase;
/**
 * Feature test for uploading a document to the '/word' endpoint.
 */
class IngestDocxTest extends TestCase
{
    /**
     * Posts a stored fixture as an uploaded file and expects a JSON response.
     *
     * Exceptions are left to propagate so PHPUnit reports them as a test failure;
     * the previous catch + dd() aborted the whole test run instead.
     *
     * @test
     */
    public function it_ingests_a_good_docx_file()
    {
        $payload = $this->postJson('/word', [
            'document' => new \Illuminate\Http\UploadedFile(
                storage_path('app/public/ContractING.odt'),
                'ContractING good.docx',
                'application/msword',
                null,
                null,
                true
            ),
        ])->decodeResponseJson();

        // Minimal check so the test actually asserts something about the response.
        $this->assertNotNull($payload);
    }
}

36
tests/Feature/ProcessDocxDocumentTest.php

@ -0,0 +1,36 @@
<?php
namespace Tests\Feature;
use App\Ingest\DocxReader;
use App\Ingest\DocxWriter;
use App\Jobs\RecreateDocument;
use Illuminate\Support\Facades\Storage;
use Tests\TestCase;
/**
 * Smoke tests for the DOCX read/write round trip and the document recreation job.
 * They only exercise the code paths; no assertions are made on the produced files.
 */
class ProcessDocxDocumentTest extends TestCase
{
    /**
     * Reads a DOCX fixture from the local disk and writes the parsed result back out.
     *
     * @test
     */
    public function it_reads_docx_documents_content()
    {
        $disk = Storage::disk('local');

        // Other fixtures tried here previously: 'contracts/x.docx', 'contracts/y.docx'.
        $parsed = (new DocxReader($disk, 'contracts/z.docx'))->execute();

        (new DocxWriter($disk, 'contracts/test-write.docx'))->execute($parsed);
    }

    /**
     * Runs the RecreateDocument job synchronously against a stored JSON fixture.
     *
     * @test
     */
    public function it_recreates_original_document_from_json()
    {
        $json = Storage::disk('local')->get('contracts/x.json');

        $job = new RecreateDocument('test123', json_decode($json, true));
        $job->handle();
    }
}
Loading…
Cancel
Save