Repo for the search-and-displace ingest module, which takes ODF, DOCX and PDF files and transforms them into Markdown (.md) for use with search-and-displace operations.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

777 lines
24 KiB

<?php
namespace App\Ingest;
use PhpOffice\PhpWord\Element\AbstractElement;
use PhpOffice\PhpWord\Element\Bookmark;
use PhpOffice\PhpWord\Element\Header;
use PhpOffice\PhpWord\Element\Image;
use PhpOffice\PhpWord\Element\Line;
use PhpOffice\PhpWord\Element\Link;
use PhpOffice\PhpWord\Element\ListItem;
use PhpOffice\PhpWord\Element\ListItemRun;
use PhpOffice\PhpWord\Element\PageBreak;
use PhpOffice\PhpWord\Element\PreserveText;
use PhpOffice\PhpWord\Element\Section;
use PhpOffice\PhpWord\Element\Table;
use PhpOffice\PhpWord\Element\Text;
use PhpOffice\PhpWord\Element\TextBreak;
use PhpOffice\PhpWord\Element\TextRun;
use PhpOffice\PhpWord\Element\Title;
use PhpOffice\PhpWord\IOFactory;
use PhpOffice\PhpWord\Style;
/**
 * Loads a document through PhpWord and flattens it into nested plain arrays
 * (sections, elements, styles) plus a running plain-text index.
 *
 * Every piece of text that goes through addText() receives a hash; the hash is
 * stored both on the element's array and in $textContents together with the
 * offsets of that text inside the concatenated text.
 */
class DocxReader extends AbstractConvertor
{
    /**
     * Concatenated document text ('text') and one entry per indexed text
     * fragment ('elements': hash, range_start, range_end).
     *
     * @var array
     */
    protected $textContents;

    /**
     * Running offset used to compute the ranges stored in $textContents.
     *
     * @var int
     */
    protected $textLength;

    // protected $readersMapper;

    /**
     * @param mixed  $storage Forwarded to the parent constructor; execute() calls ->path() on it.
     * @param string $path    Forwarded to the parent constructor; path of the document inside $storage.
     */
    public function __construct($storage, $path)
    {
        parent::__construct($storage, $path);

        $this->textContents = [
            'text' => '',
            'elements' => [],
        ];
        $this->textLength = 0;

        // $this->readersMapper = [
        // 'docx' => 'Word2007',
        // 'odt' => 'ODText',
        // 'rtf' => 'RTF',
        // ];
    }

    /**
     * Loads the document and returns its array representation.
     *
     * @return array Keys: default_font_name, default_font_size, styles, elements, contents.
     */
    public function execute()
    {
        // Converting to HTML and then back to DOCX loses some content and styles (lost when converted to HTML).
        $data = [];

        // Style::getStyles() below reads a registry that is accessed statically,
        // i.e. shared by the whole PHP process. Clear it before loading so that
        // styles registered while reading a previous document (for example in a
        // long-running worker) are not reported as belonging to this one.
        Style::resetStyles();

        $handler = IOFactory::load($this->storage->path($this->path));

        /**
         * @ISSUE
         * At the moment of this writing (08/sept/2021) phpword does not support reading bookmarks from the
         * DOCX file, in order to add the support we can, for example, add the following lines of
         * code to the AbstractPart.php file before the Text and TextRun handling case.
         *
         * $els = $xmlReader->getElements('w:bookmarkStart', $domNode);
         * if ($els && $els->count() > 0) {
         * $parent->addBookmark($els[0]->getAttribute('w:name'));
         * }
         *
         */
        $data['default_font_name'] = $handler->getDefaultFontName();
        $data['default_font_size'] = $handler->getDefaultFontSize();
        $data['styles'] = $this->handleStyles(Style::getStyles());

        $elements = [];
        foreach ($handler->getSections() as $section) {
            $elements[] = $this->sectionToJson($section);
        }

        $data['elements'] = $elements;
        $data['contents'] = $this->textContents;

        return $data;
    }

    /**
     * Serialises a section with its body elements, footers, headers and style.
     *
     * @return array
     */
    protected function sectionToJson(Section $section)
    {
        return array_merge($this->elementToJson($section), [
            'element_type' => 'Section',
            'elements' => $this->elementsToJson($section->getElements()),
            'footers' => $this->handleFooters($section->getFooters()),
            // 'footnote_properties' => $this->handleFooters($section->getFootnotePropoperties()),
            'headers' => $this->handleHeaders($section->getHeaders()),
            'style' => $this->sectionStyleToJson($section->getStyle()),
        ]);
    }

    /**
     * Serialises every registered style, preserving the array keys.
     *
     * Each style object is dispatched to "<shortClassName>StyleToJson"
     * (e.g. a Font style goes to fontStyleToJson()).
     *
     * @param array $styles Style objects, as returned by Style::getStyles().
     *
     * @return array
     */
    protected function handleStyles($styles)
    {
        return array_map(function ($style) {
            $handler = lcfirst($this->shortClassName($style)) . 'StyleToJson';

            return $this->$handler($style);
        }, $styles);
    }

    /**
     * @param array $headers Header elements of one section.
     *
     * @return array List of serialised headers.
     */
    protected function handleHeaders(array $headers)
    {
        $list = [];
        foreach ($headers as $header) {
            $list[] = $this->headerToJson($header);
        }

        return $list;
    }

    /**
     * @param array $footers Footer elements of one section.
     *
     * @return array List of serialised footers.
     */
    protected function handleFooters(array $footers)
    {
        $list = [];
        foreach ($footers as $footer) {
            $list[] = $this->footerToJson($footer);
        }

        return $list;
    }

    /**
     * Serialises one element by dispatching to "<shortClassName>ToJson".
     *
     * The common element attributes are always included. Element classes that
     * have no dedicated handler in this class are serialised generically by
     * unsupportedElementToJson() instead of aborting the whole document with
     * an "undefined method" error.
     *
     * @param AbstractElement $element
     *
     * @return array
     */
    protected function handleElement($element)
    {
        $className = $this->shortClassName($element);
        $handler = lcfirst($className) . 'ToJson';

        $specific = method_exists($this, $handler)
            ? $this->$handler($element)
            : $this->unsupportedElementToJson($element, $className);

        return array_merge($this->elementToJson($element), $specific);
    }

    /**
     * Generic representation for element classes without a dedicated handler.
     *
     * Keeps the element's type name and, when the element exposes
     * getElements(), recurses into its children so nested text is still indexed.
     *
     * @param object $element
     * @param string $className Short (un-namespaced) class name of $element.
     *
     * @return array
     */
    protected function unsupportedElementToJson($element, $className)
    {
        $json = ['element_type' => $className];

        if (method_exists($element, 'getElements')) {
            $json['elements'] = $this->elementsToJson($element->getElements());
        }

        return $json;
    }

    /**
     * Returns the class name of $object without its namespace.
     *
     * @param object $object
     *
     * @return string
     */
    protected function shortClassName($object)
    {
        $segments = explode('\\', get_class($object));

        return end($segments);
    }

    /**
     * Attributes shared by every element.
     *
     * @return array
     */
    protected function elementToJson(AbstractElement $element)
    {
        return [
            // 'comment_range_end' => $element->getCommentRangeEnd(),
            // 'comment_range_start' => $element->getCommentRangeStart(),
            'doc_part' => $element->getDocPart(),
            'doc_part_id' => $element->getDocPartId(),
            'element_id' => $element->getElementId(),
            'element_index' => $element->getElementIndex(),
            // 'parent' => $element->getParent(),
            'nested_level' => $element->getNestedLevel(),
            'relation_id' => $element->getRelationId(),
            'section_id' => $element->getSectionId(),
        ];
    }

    /**
     * @param array $elements Child elements of a container.
     *
     * @return array List of serialised elements, in the given order.
     */
    protected function elementsToJson(array $elements)
    {
        $list = [];
        foreach ($elements as $element) {
            $list[] = $this->handleElement($element);
        }

        return $list;
    }

    /**
     * Same shape as a footer, with element_type overridden to 'Header'.
     *
     * @return array
     */
    protected function headerToJson(Header $header)
    {
        return array_merge(
            $this->footerToJson($header),
            [
                'element_type' => 'Header',
            ]
        );
    }

    /**
     * Serialises a footer (also used as the base for headers, hence no type hint).
     *
     * @param object $footer Element exposing getElements() and getType().
     *
     * @return array
     */
    protected function footerToJson($footer)
    {
        return array_merge($this->elementToJson($footer), [
            'element_type' => 'Footer',
            'elements' => $this->elementsToJson($footer->getElements()),
            'type' => $footer->getType(),
        ]);
    }

    /**
     * @return array
     */
    protected function bookmarkToJson(Bookmark $element)
    {
        return array_merge($this->elementToJson($element), [
            'element_type' => 'Bookmark',
            'name' => $element->getName(),
        ]);
    }

    /**
     * @return array
     */
    protected function imageToJson(Image $image)
    {
        return array_merge($this->elementToJson($image), [
            'element_type' => 'Image',
            'name' => $image->getName(),
            'style' => $this->imageStyleToJson($image->getStyle()),
            'source' => $image->getSource(),
            'source_type' => $image->getSourceType(),
            'is_watermark' => $image->isWatermark(),
        ]);
    }

    /**
     * @return array
     */
    protected function lineToJson(Line $element)
    {
        return array_merge($this->elementToJson($element), [
            'element_type' => 'Line',
            'style' => $this->lineStyleToJson($element->getStyle()),
        ]);
    }

    /**
     * Serialises a hyperlink, including its target and label so that they are
     * not lost in the output.
     *
     * NOTE(review): the label is not passed to addText(), so it is still absent
     * from the plain-text index; confirm with the consumers whether link labels
     * should be indexed as well.
     *
     * @return array
     */
    protected function linkToJson(Link $element)
    {
        return [
            'element_type' => 'Link',
            'source' => $element->getSource(),
            'text' => $element->getText(),
        ];
    }

    /**
     * @return array
     */
    protected function listItemToJson(ListItem $element)
    {
        return array_merge($this->elementToJson($element), [
            'element_type' => 'ListItem',
            'depth' => $element->getDepth(),
            'style' => $this->listItemStyleToJson($element->getStyle()),
            'text' => $element->getText(),
            'text_object' => $this->textToJson($element->getTextObject()),
        ]);
    }

    /**
     * A list-item run is a text run plus list depth and list style.
     *
     * @return array
     */
    protected function listItemRunToJson(ListItemRun $element)
    {
        return array_merge($this->textRunToJson($element), [
            'element_type' => 'ListItemRun',
            'depth' => $element->getDepth(),
            'style' => $this->listItemStyleToJson($element->getStyle()),
        ]);
    }

    /**
     * Serialises a PreserveText element and indexes its text.
     *
     * getText() may return an array of fragments; all of them are joined so
     * nothing after the first fragment is dropped.
     *
     * @return array
     */
    protected function preserveTextToJson(PreserveText $element)
    {
        $fontStyle = $element->getFontStyle();
        $paragraphStyle = $element->getParagraphStyle();

        $text = $element->getText();
        if (is_array($text)) {
            $text = implode('', $text);
        }

        $hash = $this->addText($text);

        return array_merge($this->elementToJson($element), [
            'element_type' => 'PreserveText',
            'font_style' => $this->namedOrInlineFontStyle($fontStyle),
            'paragraph_style' => $this->namedOrInlineParagraphStyle($paragraphStyle),
            'text' => $text,
            'hash' => $hash,
        ]);
    }

    /**
     * @return array
     */
    protected function pageBreakToJson(PageBreak $element)
    {
        return [
            'element_type' => 'PageBreak',
        ];
    }

    /**
     * Serialises a Text element and indexes its text.
     *
     * @return array
     */
    protected function textToJson(Text $element)
    {
        $fontStyle = $element->getFontStyle();
        $paragraphStyle = $element->getParagraphStyle();
        $text = $element->getText();
        $hash = $this->addText($text);

        return array_merge($this->elementToJson($element), [
            'element_type' => 'Text',
            'font_style' => $this->namedOrInlineFontStyle($fontStyle),
            'paragraph_style' => $this->namedOrInlineParagraphStyle($paragraphStyle),
            'text' => $text,
            'hash' => $hash,
        ]);
    }

    /**
     * @return array
     */
    protected function textBreakToJson(TextBreak $element)
    {
        return array_merge($this->elementToJson($element), [
            'element_type' => 'TextBreak',
            'font_style' => $this->namedOrInlineFontStyle($element->getFontStyle()),
            'paragraph_style' => $this->namedOrInlineParagraphStyle($element->getParagraphStyle()),
        ]);
    }

    /**
     * Serialises a text run together with its child elements.
     *
     * @return array
     */
    protected function textRunToJson(TextRun $element)
    {
        $paragraphStyle = $element->getParagraphStyle();

        return array_merge($this->elementToJson($element), [
            'element_type' => 'TextRun',
            'paragraph_style' => $this->namedOrInlineParagraphStyle($paragraphStyle),
            'elements' => $this->elementsToJson($element->getElements()),
        ]);
    }

    /**
     * @return array
     */
    protected function tableToJson(Table $element)
    {
        return array_merge($this->elementToJson($element), [
            'element_type' => 'Table',
            'style' => $this->tableStyleToJson($element->getStyle()),
            'rows' => array_map(function ($row) {
                return $this->rowToJson($row);
            }, $element->getRows()),
            'width' => $element->getWidth(),
        ]);
    }

    /**
     * Serialises a title.
     *
     * When getText() returns an object it is serialised as a text run under
     * 'elements' and 'text' is left empty; otherwise the string is stored in
     * 'text' and, when non-empty, indexed.
     *
     * @return array
     */
    protected function titleToJson(Title $element)
    {
        $elements = [];
        $text = $element->getText();
        if (is_object($text)) {
            $elements = [$this->textRunToJson($text)];
            $text = '';
        }

        $result = array_merge($this->elementToJson($element), [
            'element_type' => 'Title',
            'depth' => $element->getDepth(),
            'style' => $element->getStyle(),
            'text' => $text,
            'elements' => $elements,
        ]);

        // Explicit emptiness check: a plain truthiness test would also skip
        // the title "0".
        if (is_string($text) && $text !== '') {
            $result['hash'] = $this->addText($text);
        }

        return $result;
    }

    /**
     * @param object $row Table row exposing getHeight(), getStyle() and getCells().
     *
     * @return array
     */
    protected function rowToJson($row)
    {
        return [
            'height' => $row->getHeight(),
            'style' => $this->rowStyleToJson($row->getStyle()),
            'cells' => array_map(function ($cell) {
                return $this->cellToJson($cell);
            }, $row->getCells()),
        ];
    }

    /**
     * @param object $cell Table cell exposing getStyle(), getWidth() and getElements().
     *
     * @return array
     */
    protected function cellToJson($cell)
    {
        return [
            'style' => $this->cellStyleToJson($cell->getStyle()),
            'width' => $cell->getWidth(),
            'elements' => $this->elementsToJson($cell->getElements()),
        ];
    }

    // Styles

    /**
     * Returns a font style as-is when it is given by name (string), as an
     * array when it is a style object, and null when it is empty. Mirrors the
     * string handling of tableStyleToJson().
     *
     * @param mixed $style
     *
     * @return array|string|null
     */
    protected function namedOrInlineFontStyle($style)
    {
        if ( ! $style) {
            return null;
        }
        if (is_string($style)) {
            return $style;
        }

        return $this->fontStyleToJson($style);
    }

    /**
     * Paragraph-style counterpart of namedOrInlineFontStyle().
     *
     * @param mixed $style
     *
     * @return array|string|null
     */
    protected function namedOrInlineParagraphStyle($style)
    {
        if ( ! $style) {
            return null;
        }
        if (is_string($style)) {
            return $style;
        }

        return $this->paragraphStyleToJson($style);
    }

    /**
     * Border attributes shared by cell and table styles.
     *
     * @param object $style Style exposing the getBorder*() getters.
     *
     * @return array
     */
    protected function borderStyleToJson($style)
    {
        return [
            'style' => 'border',
            'BorderTopSize' => $style->getBorderTopSize(),
            'BorderTopColor' => $style->getBorderTopColor(),
            'BorderTopStyle' => $style->getBorderTopStyle(),
            'BorderLeftSize' => $style->getBorderLeftSize(),
            'BorderLeftColor' => $style->getBorderLeftColor(),
            'BorderLeftStyle' => $style->getBorderLeftStyle(),
            'BorderRightSize' => $style->getBorderRightSize(),
            'BorderRightColor' => $style->getBorderRightColor(),
            'BorderRightStyle' => $style->getBorderRightStyle(),
            'BorderBottomSize' => $style->getBorderBottomSize(),
            'BorderBottomColor' => $style->getBorderBottomColor(),
            'BorderBottomStyle' => $style->getBorderBottomStyle(),
        ];
    }

    /**
     * Border attributes plus cell-specific ones; 'VAlign' is only present
     * when set.
     *
     * @return array
     */
    protected function cellStyleToJson(Style\Cell $style)
    {
        $styles = array_merge($this->borderStyleToJson($style), [
            'style' => 'cell',
            'TextDirection' => $style->getTextDirection(),
            'BgColor' => $style->getBgColor(),
            'GridSpan' => $style->getGridSpan(),
            'VMerge' => $style->getVMerge(),
            'Shading' => $this->shadingStyleToJson($style->getShading()),
            'Width' => $style->getWidth(),
            'Unit' => $style->getUnit(),
        ]);

        $vAlign = $style->getVAlign();
        if ($vAlign) {
            $styles['VAlign'] = $vAlign;
        }

        return $styles;
    }

    /**
     * Page layout of a section; 'VAlign' is only present when set.
     *
     * NOTE(review): 'LineNumbering' is emitted exactly as returned by the
     * getter; if that is an object it may need its own converter - confirm.
     *
     * @return array
     */
    protected function sectionStyleToJson(Style\Section $style)
    {
        $styles = [
            'style' => 'section',
            'BreakType' => $style->getBreakType(),
            'ColsNum' => $style->getColsNum(),
            'ColsSpace' => $style->getColsSpace(),
            'FooterHeight' => $style->getFooterHeight(),
            'Gutter' => $style->getGutter(),
            'HeaderHeight' => $style->getHeaderHeight(),
            'LineNumbering' => $style->getLineNumbering(),
            'MarginBottom' => $style->getMarginBottom(),
            'MarginLeft' => $style->getMarginLeft(),
            'MarginRight' => $style->getMarginRight(),
            'MarginTop' => $style->getMarginTop(),
            'Orientation' => $style->getOrientation(),
            'PageNumberingStart' => $style->getPageNumberingStart(),
            'PageSizeH' => $style->getPageSizeH(),
            'PageSizeW' => $style->getPageSizeW(),
            'PaperSize' => $style->getPaperSize(),
        ];

        $vAlign = $style->getVAlign();
        if ($vAlign) {
            $styles['VAlign'] = $vAlign;
        }

        return $styles;
    }

    /**
     * @param object|null $style Shading style, or an empty value when absent.
     *
     * @return array|null Null when no shading is set.
     */
    protected function shadingStyleToJson($style)
    {
        if ( ! $style) {
            return null;
        }

        return [
            'style' => 'shading',
            'pattern' => $style->getPattern(),
            'color' => $style->getColor(),
            'fill' => $style->getFill(),
        ];
    }

    /**
     * Image-style attributes plus line-specific ones.
     *
     * @return array
     */
    protected function lineStyleToJson(Style\Line $style)
    {
        return array_merge($this->imageStyleToJson($style), [
            'style' => 'line',
            'BeginArrow' => $style->getBeginArrow(),
            'Color' => $style->getColor(),
            'ConnectorType' => $style->getConnectorType(),
            'Dash' => $style->getDash(),
            'EndArrow' => $style->getEndArrow(),
            'Flip' => $style->isFlip(),
            'Weight' => $style->getWeight(),
        ]);
    }

    /**
     * @return array
     */
    protected function listItemStyleToJson(Style\ListItem $style)
    {
        return [
            'style' => 'line_item',
            'ListType' => $style->getListType(),
            'NumStyle' => $style->getNumStyle(),
        ];
    }

    /**
     * Font attributes; 'Paragraph' is only present when the font style
     * carries a paragraph style.
     *
     * @return array
     */
    protected function fontStyleToJson(Style\Font $style)
    {
        $styles = [
            'style' => 'font',
            'StyleName' => $style->getStyleName(),
            'Name' => $style->getName(),
            'Size' => $style->getSize(),
            'Color' => $style->getColor(),
            'Hint' => $style->getHint(),
            'Bold' => $style->isBold(),
            'Italic' => $style->isItalic(),
            'Underline' => $style->getUnderline(),
            'Strikethrough' => $style->isStrikethrough(),
            'DoubleStrikethrough' => $style->isDoubleStrikethrough(),
            'SuperScript' => $style->isSuperScript(),
            'SubScript' => $style->isSubScript(),
            'SmallCaps' => $style->isSmallCaps(),
            'AllCaps' => $style->isAllCaps(),
            'FgColor' => $style->getFgColor(),
            'Hidden' => $style->isHidden(),
            'Type' => $style->getStyleType(),
            'Scale' => $style->getScale(),
            'Spacing' => $style->getSpacing(),
            'Kerning' => $style->getKerning(),
            'Position' => $style->getPosition(),
        ];

        $paragraph = $style->getParagraph();
        if ($paragraph) {
            $styles['Paragraph'] = $this->paragraphStyleToJson($paragraph);
        }

        return $styles;
    }

    /**
     * @return array
     */
    protected function frameStyleToJson(Style\Frame $style)
    {
        return [
            'style' => 'frame',
            'Alignment' => $style->getAlignment(),
            'Height' => $style->getHeight(),
            'Left' => $style->getLeft(),
            'HPos' => $style->getHPos(),
            'HPosRelTo' => $style->getHPosRelTo(),
            'Pos' => $style->getPos(),
            'VPos' => $style->getVPos(),
            'VPosRelTo' => $style->getVPosRelTo(),
            'Position' => $style->getPosition(),
            'Top' => $style->getTop(),
            'Unit' => $style->getUnit(),
            'Width' => $style->getWidth(),
            'Wrap' => $style->getWrap(),
            'WrapDistanceBottom' => $style->getWrapDistanceBottom(),
            'WrapDistanceLeft' => $style->getWrapDistanceLeft(),
            'WrapDistanceRight' => $style->getWrapDistanceRight(),
            'WrapDistanceTop' => $style->getWrapDistanceTop(),
        ];
    }

    /**
     * Frame attributes plus image-specific ones.
     *
     * @return array
     */
    protected function imageStyleToJson(Style\Image $style)
    {
        return array_merge($this->frameStyleToJson($style), [
            'style' => 'image',
            'MarginLeft' => $style->getMarginLeft(),
            'MarginTop' => $style->getMarginTop(),
            'WrappingStyle' => $style->getWrappingStyle(),
            'Positioning' => $style->getPositioning(),
            'PosHorizontal' => $style->getPosHorizontal(),
            'PosHorizontalRel' => $style->getPosHorizontalRel(),
            'PosVertical' => $style->getPosVertical(),
            'PosVerticalRel' => $style->getPosVerticalRel(),
        ]);
    }

    /**
     * @param object|null $style Indentation style, or an empty value when absent.
     *
     * @return array|null Null when no indentation is set.
     */
    protected function indentationStyleToJson($style)
    {
        if ( ! $style) {
            return null;
        }

        return [
            'style' => 'indentation',
            'Left' => $style->getLeft(),
            'Right' => $style->getRight(),
            'FirstLine' => $style->getFirstLine(),
            'Hanging' => $style->getHanging(),
        ];
    }

    /**
     * @return array
     */
    protected function spacingStyleToJson(Style\Spacing $style)
    {
        return [
            'style' => 'spacing',
            'Before' => $style->getBefore(),
            'After' => $style->getAfter(),
            'Line' => $style->getLine(),
            'LineRule' => $style->getLineRule(),
        ];
    }

    /**
     * Numbering definition with all of its levels.
     *
     * @return array
     */
    protected function numberingStyleToJson(Style\Numbering $style)
    {
        return [
            'style' => 'numbering',
            'NumId' => $style->getNumId(),
            'Type' => $style->getType(),
            'StyleName' => $style->getStyleName(),
            'Index' => $style->getIndex(),
            'Levels' => array_map(function ($numberingLevel) {
                return $this->numberingLevelStyleToJson($numberingLevel);
            }, $style->getLevels()),
        ];
    }

    /**
     * Single numbering level. Note that the discriminator key here is 'type',
     * whereas every other style converter in this class uses 'style'.
     *
     * @return array
     */
    protected function numberingLevelStyleToJson(Style\NumberingLevel $style)
    {
        return [
            'type' => 'numbering_level',
            'Level' => $style->getLevel(),
            'Start' => $style->getStart(),
            'Format' => $style->getFormat(),
            'Restart' => $style->getRestart(),
            'PStyle' => $style->getPStyle(),
            'Suffix' => $style->getSuffix(),
            'Text' => $style->getText(),
            'Alignment' => $style->getAlignment(),
            'Left' => $style->getLeft(),
            'Hanging' => $style->getHanging(),
            'TabPos' => $style->getTabPos(),
            'Font' => $style->getFont(),
            'Hint' => $style->getHint(),
        ];
    }

    /**
     * Paragraph attributes.
     *
     * 'Alignment' and 'TextAlignment' fall back to 'baseline' when unset.
     * Indentation, spacing and shading are converted to arrays through their
     * own converters (shading the same way cellStyleToJson() does it).
     *
     * NOTE(review): 'Tabs' is emitted exactly as returned by getTabs(); if it
     * holds objects they may need their own converter - confirm.
     *
     * @return array
     */
    protected function paragraphStyleToJson(Style\Paragraph $style)
    {
        $styles = [
            'Name' => $style->getStyleName(),
            'BasedOn' => $style->getBasedOn(),
            'Next' => $style->getNext(),
            'Alignment' => $style->getAlignment(),
            'Indentation' => $style->getIndentation(),
            'Spacing' => $style->getSpacing(),
            'WidowControl' => $style->hasWidowControl(),
            'KeepNext' => $style->isKeepNext(),
            'KeepLines' => $style->isKeepLines(),
            'PageBreakBefore' => $style->hasPageBreakBefore(),
            'NumStyle' => $style->getNumStyle(),
            'NumLevel' => $style->getNumLevel(),
            'Tabs' => $style->getTabs(),
            'Shading' => $this->shadingStyleToJson($style->getShading()),
            'ContextualSpacing' => $style->hasContextualSpacing(),
            'Bidi' => $style->isBidi(),
            'TextAlignment' => $style->getTextAlignment(),
            'SuppressAutoHyphens' => $style->hasSuppressAutoHyphens(),
        ];
        $styles['style'] = 'paragraph';

        if ( ! $styles['Alignment']) {
            $styles['Alignment'] = 'baseline';
        }
        if ( ! $styles['TextAlignment']) {
            $styles['TextAlignment'] = 'baseline';
        }
        if ($styles['Indentation']) {
            $styles['Indentation'] = $this->indentationStyleToJson($styles['Indentation']);
        }
        if ($styles['Spacing']) {
            $styles['Spacing'] = $this->spacingStyleToJson($styles['Spacing']);
        }

        return $styles;
    }

    /**
     * Table style: empty array when absent, the style name when given as a
     * string, otherwise border attributes plus table-specific ones. Shading is
     * converted through shadingStyleToJson(), as in cellStyleToJson().
     *
     * @param object|string|null $style
     *
     * @return array|string
     */
    protected function tableStyleToJson($style)
    {
        if ( ! $style) {
            return [];
        }
        if (is_string($style)) {
            return $style;
        }

        return array_merge(
            $this->borderStyleToJson($style),
            [
                'style' => 'table',
                'BgColor' => $style->getBgColor(),
                'CellSpacing' => $style->getCellSpacing(),
                'Shading' => $this->shadingStyleToJson($style->getShading()),
                'Alignment' => $style->getAlignment(),
                'Width' => $style->getWidth(),
                'Unit' => $style->getUnit(),
                'Layout' => $style->getLayout(),
                'ColumnWidths' => $style->getColumnWidths(),
                'BidiVisual' => $style->isBidiVisual(),
                'position' => $this->tablePositionStyleToJson($style->getPosition()),
                'first_row' => $this->tableStyleToJson($style->getFirstRow()),
                'BorderInsideHSize' => $style->getBorderInsideHSize(),
                'BorderInsideHColor' => $style->getBorderInsideHColor(),
                'BorderInsideVSize' => $style->getBorderInsideVSize(),
                'BorderInsideVColor' => $style->getBorderInsideVColor(),
                'CellMarginTop' => $style->getCellMarginTop(),
                'CellMarginRight' => $style->getCellMarginRight(),
                'CellMarginLeft' => $style->getCellMarginLeft(),
                'CellMarginBottom' => $style->getCellMarginBottom(),
            ]
        );
    }

    /**
     * @param object|null $style Table position style, or an empty value when absent.
     *
     * @return array Empty array when no position is set.
     */
    protected function tablePositionStyleToJson($style)
    {
        if ( ! $style) {
            return [];
        }

        return [
            'style' => 'table_position',
            'LeftFromText' => $style->getLeftFromText(),
            'RightFromText' => $style->getRightFromText(),
            'TopFromText' => $style->getTopFromText(),
            'BottomFromText' => $style->getBottomFromText(),
            'VertAnchor' => $style->getVertAnchor(),
            'HorzAnchor' => $style->getHorzAnchor(),
            'TblpXSpec' => $style->getTblpXSpec(),
            'TblpX' => $style->getTblpX(),
            'TblpYSpec' => $style->getTblpYSpec(),
            'TblpY' => $style->getTblpY(),
        ];
    }

    /**
     * @param object $style Row style exposing isTblHeader(), isCantSplit() and isExactHeight().
     *
     * @return array
     */
    protected function rowStyleToJson($style)
    {
        return [
            'style' => 'row',
            'TblHeader' => $style->isTblHeader(),
            'CantSplit' => $style->isCantSplit(),
            'ExactHeight' => $style->isExactHeight(),
        ];
    }

    /**
     * Appends $text to the plain-text index and records its range.
     *
     * range_start/range_end are inclusive offsets computed with strlen(), i.e.
     * in bytes.
     *
     * NOTE(review): an empty $text still advances the running offset by one
     * although nothing is appended to the concatenated text, so later ranges
     * no longer line up with 'text'. Behaviour kept as-is because consumers
     * may rely on it - confirm the intended contract before changing.
     *
     * @param string $text
     *
     * @return string Hash identifying the recorded fragment.
     */
    protected function addText($text)
    {
        $hash = $this->generateHash();
        $length = strlen($text);

        $this->textContents['text'] .= $text;
        $this->textContents['elements'][] = [
            'hash' => $hash,
            'range_start' => $this->textLength,
            'range_end' => $this->textLength + ($length > 0 ? $length - 1 : 0),
        ];
        $this->textLength += ($length > 0 ? $length : 1);

        return $hash;
    }

    /**
     * @return string Identifier produced by uniqid().
     */
    protected function generateHash()
    {
        return uniqid();
    }
}