html parser from XRevan86

2017-06-18 17:31:35 +03:00
parent 47aab1356e
commit aa3c2e49cc
3 changed files with 124 additions and 0 deletions
@@ -1,3 +1,5 @@
 # hptoad

 An MIT licensed XMPP bot written using Python 3 and slixmpp.
+
+Original project: https://gitlab.com/XRevan86/hptoad
@@ -0,0 +1,62 @@
+# -*- python -*-
+import asyncio
+import os
+import re
+
+class Plugin:
+    _trim_regexp = re.compile("(`|\\$|\\.\\.)")
+    _quote_regexp = re.compile("(\"|')")
+
+    @classmethod
+    def _trim(cls, s):
+        result = cls._trim_regexp.sub("", s)
+        result = cls._quote_regexp.sub("“", result).strip()
+        return result
+
+    # letter(ASCII or cyrillic), number, underscore only.
+    _cmd_validator_regexp = re.compile("^(\\w|\\p{Cyrillic})*$")
+
+    @asyncio.coroutine
+    def _exec_cmd(self, cmd, body, nick, dir_path, is_admin):
+        is_admin = "true" if is_admin else "false"
+        path = os.path.join(dir_path, self._trim(cmd))
+
+        if not self._cmd_validator_regexp.match(cmd) or \
+           not os.access(path, os.F_OK | os.X_OK) or not os.path.isfile(path):
+            return {"handled": False}
+
+        if not os.access(path, os.R_OK):
+            return {"handled": True,
+                    "error": "\"%s\" is not readable" % path}
+
+        cmd = [path, self._trim(nick), is_admin, self._trim(body)]
+        try:
+            pipe = asyncio.subprocess.PIPE
+            proc = yield from asyncio.create_subprocess_exec(*cmd,
+                                                             stdout=pipe,
+                                                             stderr=pipe)
+            cmd_reply, cmd_error = yield from proc.communicate()
+        except OSError as e:
+            return {"handled": True,
+                    "error": "Execute: %s" % str(e)}
+
+        result = {}
+        if cmd_error and len(cmd_error.strip()) > 0:
+            result["error"] = "Process: %s" % cmd_error.strip()
+        if cmd_reply and len(cmd_reply.strip()) > 0:
+            result["reply"] = cmd_reply.decode().strip()
+        if result:
+            result["handled"] = True
+        return result
+
+    @asyncio.coroutine
+    def command(self, command, body, nick, from_id, is_admin):
+        result = yield from self._exec_cmd(command, body, nick,
+                                           "plugins", is_admin)
+        return result
+
+    @asyncio.coroutine
+    def question(self, body, nick, from_id, is_admin):
+        result = yield from self._exec_cmd("answer", body, nick,
+                                           "chat", is_admin)
+        return result
@@ -0,0 +1,60 @@
+# -*- python -*-
+import asyncio
+import bs4
+import functools
+import lxml
+import re
+import requests
+
+class Plugin:
+    _html_regexp = re.compile(r"(https?://[^\"\s>]+)")
+
+    @asyncio.coroutine
+    def chat_message(self, body, nick, from_id, is_admin):
+        loop = asyncio.get_event_loop()
+        result = {}
+
+        urls = self._html_regexp.findall(body)
+        if urls:
+            result["handled"] = True
+            mime_types = ("application/xhtml+xml", "application/xml",
+                          "text/html", "text/xml")
+            reply = ""
+            for url in urls:
+                try:
+                    req = yield from \
+                          loop.run_in_executor(None,
+                                               functools.partial(requests.get,
+                                                                 url,
+                                                                 stream=True))
+                    if req.headers["content-type"].startswith(mime_types):
+                        # Handle a case when no charset is defined for text/html.
+                        if req.headers["content-type"].startswith("text/") and \
+                           not "charset=" in req.headers["content-type"]:
+                            req.encoding = None
+
+                        if not req.encoding:
+                            req.encoding = req.apparent_encoding
+
+                        contents = title = ""
+                        for i in req.iter_content(chunk_size=128,
+                                                  decode_unicode=True):
+                            contents += i
+                            soup = bs4.BeautifulSoup(contents, "lxml")
+                            if soup and soup.title:
+                                if soup.title.string == title:
+                                    req.close()
+                                    break
+                                title = soup.title.string
+
+                        if title:
+                            if reply:
+                                reply += "\n"
+                            reply += "Link: %s" % title
+                except Exception as e:
+                    result["error"] = "Title fetch: %s" % str(e)
+            result["reply"] = reply
+        else:
+            result["handled"] = False
+
+        return result