diff options
| author | stephanchrst <stephanchrst@gmail.com> | 2022-05-10 21:51:50 +0700 |
|---|---|---|
| committer | stephanchrst <stephanchrst@gmail.com> | 2022-05-10 21:51:50 +0700 |
| commit | 3751379f1e9a4c215fb6eb898b4ccc67659b9ace (patch) | |
| tree | a44932296ef4a9b71d5f010906253d8c53727726 /addons/attachment_indexation/models | |
| parent | 0a15094050bfde69a06d6eff798e9a8ddf2b8c21 (diff) | |
initial commit 2
Diffstat (limited to 'addons/attachment_indexation/models')
| -rw-r--r-- | addons/attachment_indexation/models/__init__.py | 4 | ||||
| -rw-r--r-- | addons/attachment_indexation/models/ir_attachment.py | 130 |
2 files changed, 134 insertions, 0 deletions
```diff
diff --git a/addons/attachment_indexation/models/__init__.py b/addons/attachment_indexation/models/__init__.py
new file mode 100644
index 00000000..3f801459
--- /dev/null
+++ b/addons/attachment_indexation/models/__init__.py
@@ -0,0 +1,4 @@
+# -*- coding: utf-8 -*-
+# Part of Odoo. See LICENSE file for full copyright and licensing details.
+
+from . import ir_attachment
diff --git a/addons/attachment_indexation/models/ir_attachment.py b/addons/attachment_indexation/models/ir_attachment.py
new file mode 100644
index 00000000..c6017f01
--- /dev/null
+++ b/addons/attachment_indexation/models/ir_attachment.py
@@ -0,0 +1,130 @@
+# -*- coding: utf-8 -*-
+# Part of Odoo. See LICENSE file for full copyright and licensing details.
+import io
+import logging
+import xml.dom.minidom
+import zipfile
+
+from odoo import api, models
+
+_logger = logging.getLogger(__name__)
+
+try:
+    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+    from pdfminer.converter import TextConverter
+    from pdfminer.pdfpage import PDFPage
+except ImportError:
+    PDFResourceManager = PDFPageInterpreter = TextConverter = PDFPage = None
+    _logger.warning("Attachment indexation of PDF documents is unavailable because the 'pdfminer' Python library cannot be found on the system. "
+                    "You may install it from https://pypi.org/project/pdfminer.six/ (e.g. `pip3 install pdfminer.six`)")
+
+FTYPES = ['docx', 'pptx', 'xlsx', 'opendoc', 'pdf']
+
+
+def textToString(element):
+    buff = u""
+    for node in element.childNodes:
+        if node.nodeType == xml.dom.Node.TEXT_NODE:
+            buff += node.nodeValue
+        elif node.nodeType == xml.dom.Node.ELEMENT_NODE:
+            buff += textToString(node)
+    return buff
+
+
+class IrAttachment(models.Model):
+    _inherit = 'ir.attachment'
+
+    def _index_docx(self, bin_data):
+        '''Index Microsoft .docx documents'''
+        buf = u""
+        f = io.BytesIO(bin_data)
+        if zipfile.is_zipfile(f):
+            try:
+                zf = zipfile.ZipFile(f)
+                content = xml.dom.minidom.parseString(zf.read("word/document.xml"))
+                for val in ["w:p", "w:h", "text:list"]:
+                    for element in content.getElementsByTagName(val):
+                        buf += textToString(element) + "\n"
+            except Exception:
+                pass
+        return buf
+
+    def _index_pptx(self, bin_data):
+        '''Index Microsoft .pptx documents'''
+
+        buf = u""
+        f = io.BytesIO(bin_data)
+        if zipfile.is_zipfile(f):
+            try:
+                zf = zipfile.ZipFile(f)
+                zf_filelist = [x for x in zf.namelist() if x.startswith('ppt/slides/slide')]
+                for i in range(1, len(zf_filelist) + 1):
+                    content = xml.dom.minidom.parseString(zf.read('ppt/slides/slide%s.xml' % i))
+                    for val in ["a:t"]:
+                        for element in content.getElementsByTagName(val):
+                            buf += textToString(element) + "\n"
+            except Exception:
+                pass
+        return buf
+
+    def _index_xlsx(self, bin_data):
+        '''Index Microsoft .xlsx documents'''
+
+        buf = u""
+        f = io.BytesIO(bin_data)
+        if zipfile.is_zipfile(f):
+            try:
+                zf = zipfile.ZipFile(f)
+                content = xml.dom.minidom.parseString(zf.read("xl/sharedStrings.xml"))
+                for val in ["t"]:
+                    for element in content.getElementsByTagName(val):
+                        buf += textToString(element) + "\n"
+            except Exception:
+                pass
+        return buf
+
+    def _index_opendoc(self, bin_data):
+        '''Index OpenDocument documents (.odt, .ods...)'''
+
+        buf = u""
+        f = io.BytesIO(bin_data)
+        if zipfile.is_zipfile(f):
+            try:
+                zf = zipfile.ZipFile(f)
+                content = xml.dom.minidom.parseString(zf.read("content.xml"))
+                for val in ["text:p", "text:h", "text:list"]:
+                    for element in content.getElementsByTagName(val):
+                        buf += textToString(element) + "\n"
+            except Exception:
+                pass
+        return buf
+
+    def _index_pdf(self, bin_data):
+        '''Index PDF documents'''
+        if PDFResourceManager is None:
+            return
+        buf = u""
+        if bin_data.startswith(b'%PDF-'):
+            f = io.BytesIO(bin_data)
+            try:
+                resource_manager = PDFResourceManager()
+                with io.StringIO() as content, TextConverter(resource_manager, content) as device:
+                    logging.getLogger("pdfminer").setLevel(logging.CRITICAL)
+                    interpreter = PDFPageInterpreter(resource_manager, device)
+
+                    for page in PDFPage.get_pages(f):
+                        interpreter.process_page(page)
+
+                    buf = content.getvalue()
+            except Exception:
+                pass
+        return buf
+
+    @api.model
+    def _index(self, bin_data, mimetype):
+        for ftype in FTYPES:
+            buf = getattr(self, '_index_%s' % ftype)(bin_data)
+            if buf:
+                return buf.replace('\x00', '')
+
+        return super(IrAttachment, self)._index(bin_data, mimetype)
```
