initial commit 2

author: stephanchrst <stephanchrst@gmail.com> 2022-05-10 21:51:50 +0700
committer: stephanchrst <stephanchrst@gmail.com> 2022-05-10 21:51:50 +0700
commit: 3751379f1e9a4c215fb6eb898b4ccc67659b9ace (patch)
tree: a44932296ef4a9b71d5f010906253d8c53727726 /addons/attachment_indexation/models
parent: 0a15094050bfde69a06d6eff798e9a8ddf2b8c21 (diff)
2 files changed, 134 insertions, 0 deletions
diff --git a/addons/attachment_indexation/models/__init__.py b/addons/attachment_indexation/models/__init__.py
new file mode 100644
index 00000000..3f801459
--- /dev/null
+++ b/addons/attachment_indexation/models/__init__.py
@@ -0,0 +1,4 @@
+# -*- coding: utf-8 -*-
+# Part of Odoo. See LICENSE file for full copyright and licensing details.
+
+from . import ir_attachment
diff --git a/addons/attachment_indexation/models/ir_attachment.py b/addons/attachment_indexation/models/ir_attachment.py
new file mode 100644
index 00000000..c6017f01
--- /dev/null
+++ b/addons/attachment_indexation/models/ir_attachment.py
@@ -0,0 +1,130 @@
+# -*- coding: utf-8 -*-
+# Part of Odoo. See LICENSE file for full copyright and licensing details.
+import io
+import logging
+import xml.dom.minidom
+import zipfile
+
+from odoo import api, models
+
+_logger = logging.getLogger(__name__)
+
+try:
+    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+    from pdfminer.converter import TextConverter
+    from pdfminer.pdfpage import PDFPage
+except ImportError:
+    PDFResourceManager = PDFPageInterpreter = TextConverter = PDFPage = None
+    _logger.warning("Attachment indexation of PDF documents is unavailable because the 'pdfminer' Python library cannot be found on the system. "
+                    "You may install it from https://pypi.org/project/pdfminer.six/ (e.g. `pip3 install pdfminer.six`)")
+
+FTYPES = ['docx', 'pptx', 'xlsx', 'opendoc', 'pdf']  # tried in this order by IrAttachment._index
+
+
+def textToString(element):
+    '''Return the text of every text node below ``element``, joined in document order.'''
+    node_kinds = xml.dom.Node
+    return u"".join(
+        child.nodeValue if child.nodeType == node_kinds.TEXT_NODE else textToString(child)
+        for child in element.childNodes
+        if child.nodeType in (node_kinds.TEXT_NODE, node_kinds.ELEMENT_NODE)
+    )
+
+
+class IrAttachment(models.Model):
+    '''Extend ``ir.attachment`` with text extraction for office and PDF documents.'''
+    _inherit = 'ir.attachment'
+
+    def _zip_xml_text(self, bin_data, members, tags):
+        '''Extract the text of the ``tags`` elements of XML files stored in a zip archive.
+
+        :param bytes bin_data: raw content of the attachment
+        :param members: archive member names to parse, or a callable receiving the
+            names present in the archive and returning the ones to parse, in order
+        :param list tags: element names; every matching element contributes one line
+        :return: the extracted text; u"" if ``bin_data`` is not a zip archive
+        '''
+        buf = u""
+        f = io.BytesIO(bin_data)
+        if zipfile.is_zipfile(f):
+            try:
+                with zipfile.ZipFile(f) as zf:
+                    names = members(zf.namelist()) if callable(members) else members
+                    for name in names:
+                        content = xml.dom.minidom.parseString(zf.read(name))
+                        for tag in tags:
+                            for element in content.getElementsByTagName(tag):
+                                buf += textToString(element) + "\n"
+            except Exception:
+                # Best effort: _index() runs every indexer on every file, so a missing
+                # member or broken XML is expected; keep whatever was read so far.
+                pass
+        return buf
+
+    def _index_docx(self, bin_data):
+        '''Index Microsoft .docx documents'''
+        return self._zip_xml_text(bin_data, ["word/document.xml"], ["w:p", "w:h", "text:list"])
+
+    def _index_pptx(self, bin_data):
+        '''Index Microsoft .pptx documents, reading the slides in numeric order'''
+        prefix, suffix = 'ppt/slides/slide', '.xml'
+
+        def slides(names):
+            # Slide numbers can have gaps, so use the members really present instead
+            # of assuming slide1..slideN; sort by number (slide2 before slide10).
+            numbered = [
+                (int(name[len(prefix):-len(suffix)]), name) for name in names
+                if name.startswith(prefix) and name.endswith(suffix)
+                and name[len(prefix):-len(suffix)].isdecimal()
+            ]
+            return [name for _number, name in sorted(numbered)]
+
+        return self._zip_xml_text(bin_data, slides, ["a:t"])
+
+    def _index_xlsx(self, bin_data):
+        '''Index Microsoft .xlsx documents'''
+        return self._zip_xml_text(bin_data, ["xl/sharedStrings.xml"], ["t"])
+
+    def _index_opendoc(self, bin_data):
+        '''Index OpenDocument documents (.odt, .ods...)'''
+        return self._zip_xml_text(bin_data, ["content.xml"], ["text:p", "text:h", "text:list"])
+
+    def _index_pdf(self, bin_data):
+        '''Index PDF documents
+
+        :param bytes bin_data: raw content of the attachment
+        :return: the text of the pages; u"" when pdfminer is not installed, when
+            ``bin_data`` lacks the ``%PDF-`` header or when the extraction fails
+        '''
+        buf = u""
+        # pdfminer is an optional dependency: its names are None when the import failed
+        if PDFResourceManager is None or not bin_data.startswith(b'%PDF-'):
+            return buf
+        f = io.BytesIO(bin_data)
+        try:
+            resource_manager = PDFResourceManager()
+            with io.StringIO() as content, TextConverter(resource_manager, content) as device:
+                # NOTE: this changes the level of the shared "pdfminer" logger
+                logging.getLogger("pdfminer").setLevel(logging.CRITICAL)
+                interpreter = PDFPageInterpreter(resource_manager, device)
+                for page in PDFPage.get_pages(f):
+                    interpreter.process_page(page)
+                buf = content.getvalue()
+        except Exception:
+            # best effort: a PDF that cannot be parsed is simply not indexed
+            pass
+        return buf
+
+    @api.model
+    def _index(self, bin_data, mimetype):
+        '''Return the text to index for an attachment.
+
+        The indexers named in FTYPES are tried in order, whatever ``mimetype`` is; the
+        first non-empty result wins (NUL characters removed), else the parent decides.
+        '''
+        for ftype in FTYPES:
+            buf = getattr(self, '_index_%s' % ftype)(bin_data)
+            if buf:
+                return buf.replace('\x00', '')
+
+        return super(IrAttachment, self)._index(bin_data, mimetype)
author	stephanchrst <stephanchrst@gmail.com>	2022-05-10 21:51:50 +0700
committer	stephanchrst <stephanchrst@gmail.com>	2022-05-10 21:51:50 +0700
commit	3751379f1e9a4c215fb6eb898b4ccc67659b9ace (patch)
tree	a44932296ef4a9b71d5f010906253d8c53727726 /addons/attachment_indexation/models
parent	0a15094050bfde69a06d6eff798e9a8ddf2b8c21 (diff)