summaryrefslogtreecommitdiff
path: root/addons/attachment_indexation/models
diff options
context:
space:
mode:
author stephanchrst <stephanchrst@gmail.com> 2022-05-10 21:51:50 +0700
committer stephanchrst <stephanchrst@gmail.com> 2022-05-10 21:51:50 +0700
commit 3751379f1e9a4c215fb6eb898b4ccc67659b9ace (patch)
tree a44932296ef4a9b71d5f010906253d8c53727726 /addons/attachment_indexation/models
parent 0a15094050bfde69a06d6eff798e9a8ddf2b8c21 (diff)
initial commit 2
Diffstat (limited to 'addons/attachment_indexation/models')
-rw-r--r-- addons/attachment_indexation/models/__init__.py 4
-rw-r--r-- addons/attachment_indexation/models/ir_attachment.py 130
2 files changed, 134 insertions, 0 deletions
diff --git a/addons/attachment_indexation/models/__init__.py b/addons/attachment_indexation/models/__init__.py
new file mode 100644
index 00000000..3f801459
--- /dev/null
+++ b/addons/attachment_indexation/models/__init__.py
@@ -0,0 +1,4 @@
+# -*- coding: utf-8 -*-
+# Part of Odoo. See LICENSE file for full copyright and licensing details.
+
+from . import ir_attachment
diff --git a/addons/attachment_indexation/models/ir_attachment.py b/addons/attachment_indexation/models/ir_attachment.py
new file mode 100644
index 00000000..c6017f01
--- /dev/null
+++ b/addons/attachment_indexation/models/ir_attachment.py
@@ -0,0 +1,130 @@
+# -*- coding: utf-8 -*-
+# Part of Odoo. See LICENSE file for full copyright and licensing details.
+import io
+import logging
+import xml.dom.minidom
+import zipfile
+
+from odoo import api, models
+
+_logger = logging.getLogger(__name__)
+
+try:
+ from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+ from pdfminer.converter import TextConverter
+ from pdfminer.pdfpage import PDFPage
+except ImportError:
+ PDFResourceManager = PDFPageInterpreter = TextConverter = PDFPage = None
+ _logger.warning("Attachment indexation of PDF documents is unavailable because the 'pdfminer' Python library cannot be found on the system. "
+ "You may install it from https://pypi.org/project/pdfminer.six/ (e.g. `pip3 install pdfminer.six`)")
+
+FTYPES = ['docx', 'pptx', 'xlsx', 'opendoc', 'pdf']
+
+
+def textToString(element):
+    """Return the text contained in ``element`` and all of its descendants.
+
+    Text nodes are concatenated in document order; element children are
+    visited recursively and every other node type is ignored.
+
+    :param element: DOM node exposing ``childNodes``
+    :return: the concatenated text, ``""`` when there is none
+    :rtype: str
+    """
+    # Collect the pieces and join once instead of growing a string with
+    # ``+=`` at every node and at every recursion level.
+    parts = []
+    for node in element.childNodes:
+        if node.nodeType == xml.dom.Node.TEXT_NODE:
+            parts.append(node.nodeValue)
+        elif node.nodeType == xml.dom.Node.ELEMENT_NODE:
+            parts.append(textToString(node))
+    return "".join(parts)
+
+
+class IrAttachment(models.Model):
+    """Extend ``ir.attachment`` with text extraction for office and PDF files.
+
+    Each ``_index_<ftype>`` method takes the raw attachment bytes and returns
+    the text it could extract; ``_index`` tries them in ``FTYPES`` order.
+    """
+    _inherit = 'ir.attachment'
+
+    def _extract_zipped_xml_text(self, bin_data, select_members, tags):
+        """Extract text from XML members of a zip-based document.
+
+        Extraction is best effort: any failure stops the extraction and the
+        text collected up to that point is returned.
+
+        :param bytes bin_data: raw attachment content
+        :param select_members: callable receiving the list of member names of
+            the archive and returning the members to parse, in indexing order
+        :param tags: tag names whose text is collected, one line per element
+        :return: the collected text, ``""`` when ``bin_data`` is not a zip
+            archive or nothing could be extracted
+        :rtype: str
+        """
+        chunks = []
+        stream = io.BytesIO(bin_data)
+        if not zipfile.is_zipfile(stream):
+            return ""
+        try:
+            # The context manager closes the archive even when parsing fails.
+            with zipfile.ZipFile(stream) as archive:
+                for member in select_members(archive.namelist()):
+                    content = xml.dom.minidom.parseString(archive.read(member))
+                    for tag in tags:
+                        for element in content.getElementsByTagName(tag):
+                            chunks.append(textToString(element) + "\n")
+        except KeyError:
+            # The expected member is absent: the archive is another format.
+            pass
+        except Exception:
+            # Never let a malformed document break the attachment itself.
+            _logger.debug("Could not index zipped XML document", exc_info=True)
+        return "".join(chunks)
+
+    @staticmethod
+    def _pptx_slide_members(names):
+        """Return the ``ppt/slides/slide<N>.xml`` names sorted by ``N``."""
+        prefix, suffix = 'ppt/slides/slide', '.xml'
+        numbered = []
+        for name in names:
+            if name.startswith(prefix) and name.endswith(suffix):
+                number = name[len(prefix):-len(suffix)]
+                if number.isdecimal():
+                    numbered.append((int(number), name))
+        return [name for _number, name in sorted(numbered)]
+
+    def _index_docx(self, bin_data):
+        """Index Microsoft .docx documents.
+
+        :param bytes bin_data: raw attachment content
+        :return: text of ``word/document.xml``, ``""`` if unavailable
+        :rtype: str
+        """
+        return self._extract_zipped_xml_text(
+            bin_data,
+            lambda names: ["word/document.xml"],
+            ["w:p", "w:h", "text:list"],
+        )
+
+    def _index_pptx(self, bin_data):
+        """Index Microsoft .pptx documents.
+
+        Slides are read from the names actually present in the archive, so
+        a gap in the slide numbering does not drop the slides after it.
+
+        :param bytes bin_data: raw attachment content
+        :return: text of every slide in slide-number order, ``""`` if none
+        :rtype: str
+        """
+        return self._extract_zipped_xml_text(
+            bin_data,
+            self._pptx_slide_members,
+            ["a:t"],
+        )
+
+    def _index_xlsx(self, bin_data):
+        """Index Microsoft .xlsx documents.
+
+        :param bytes bin_data: raw attachment content
+        :return: text of ``xl/sharedStrings.xml``, ``""`` if unavailable
+        :rtype: str
+        """
+        return self._extract_zipped_xml_text(
+            bin_data,
+            lambda names: ["xl/sharedStrings.xml"],
+            ["t"],
+        )
+
+    def _index_opendoc(self, bin_data):
+        """Index OpenDocument documents (.odt, .ods...).
+
+        :param bytes bin_data: raw attachment content
+        :return: text of ``content.xml``, ``""`` if unavailable
+        :rtype: str
+        """
+        return self._extract_zipped_xml_text(
+            bin_data,
+            lambda names: ["content.xml"],
+            ["text:p", "text:h", "text:list"],
+        )
+
+    def _index_pdf(self, bin_data):
+        """Index PDF documents.
+
+        :param bytes bin_data: raw attachment content
+        :return: the extracted text; ``""`` when the content does not start
+            with the PDF signature or cannot be parsed; ``None`` when the
+            pdfminer library could not be imported
+        """
+        if PDFResourceManager is None:
+            return None
+        buf = ""
+        if bin_data.startswith(b'%PDF-'):
+            f = io.BytesIO(bin_data)
+            try:
+                resource_manager = PDFResourceManager()
+                with io.StringIO() as content, TextConverter(resource_manager, content) as device:
+                    # Silence everything pdfminer logs below CRITICAL.
+                    logging.getLogger("pdfminer").setLevel(logging.CRITICAL)
+                    interpreter = PDFPageInterpreter(resource_manager, device)
+
+                    for page in PDFPage.get_pages(f):
+                        interpreter.process_page(page)
+
+                    # Read the text before the StringIO is closed.
+                    buf = content.getvalue()
+            except Exception:
+                # Best effort: an unreadable PDF is simply not indexed.
+                _logger.debug("Could not index PDF document", exc_info=True)
+        return buf
+
+    @api.model
+    def _index(self, bin_data, mimetype):
+        """Return the indexable text of an attachment.
+
+        Every indexer listed in ``FTYPES`` is tried in order and the first
+        non-empty result is returned with NUL characters removed; when none
+        yields text, the parent implementation is used.
+
+        :param bytes bin_data: raw attachment content
+        :param mimetype: forwarded unchanged to the parent implementation
+        """
+        for ftype in FTYPES:
+            buf = getattr(self, '_index_%s' % ftype)(bin_data)
+            if buf:
+                # NOTE(review): presumably stripped because the stored text
+                # cannot contain NUL characters - confirm against the caller.
+                return buf.replace('\x00', '')
+
+        return super(IrAttachment, self)._index(bin_data, mimetype)