Browse Source

HTML parser from XRevan86

master
ivan 2 years ago
parent
commit
aa3c2e49cc
3 changed files with 124 additions and 0 deletions
  1. 2
    0
      README.md
  2. 62
    0
      plugins/shell_cmds.py
  3. 60
    0
      plugins/title_fetch.py

+ 2
- 0
README.md View File

@@ -1,3 +1,5 @@
1 1
 # hptoad
2 2
 
3 3
 An MIT-licensed XMPP bot written using Python 3 and slixmpp.
4
+
5
+Original project: https://gitlab.com/XRevan86/hptoad

+ 62
- 0
plugins/shell_cmds.py View File

@@ -0,0 +1,62 @@
1
+# -*- python -*-
2
+import asyncio
3
+import os
4
+import re
5
+
6
class Plugin:
    """Run executable files as chat commands and relay their output.

    ``command`` looks the executable up in the ``plugins`` directory, while
    ``question`` always runs the ``answer`` executable from the ``chat``
    directory.  Every executable is invoked as::

        <path> <nick> <"true"|"false"> <body>

    and its stdout/stderr are turned into the ``reply``/``error`` entries of
    the returned dict.
    """

    # Characters removed from every argument: backticks, dollar signs and
    # ".." sequences.
    _trim_regexp = re.compile(r"(`|\$|\.\.)")
    # Straight quotes are replaced by a typographic quote.
    _quote_regexp = re.compile("(\"|')")

    # Only word characters are accepted in a command name.  In a ``str``
    # pattern ``\w`` matches Unicode letters (ASCII and Cyrillic included),
    # digits and the underscore.  The stdlib ``re`` module has no
    # ``\p{Cyrillic}`` class and rejects it with ``re.error``.
    _cmd_validator_regexp = re.compile(r"^\w*$")

    @classmethod
    def _trim(cls, s):
        """Return *s* stripped of risky characters and surrounding whitespace."""
        result = cls._trim_regexp.sub("", s)
        return cls._quote_regexp.sub("“", result).strip()

    @staticmethod
    def _build_result(cmd_reply, cmd_error):
        """Convert the raw process output (bytes) into a result dict.

        Returns an empty dict when the process printed nothing on either
        stream; otherwise ``handled`` is True and ``reply``/``error`` hold
        the decoded, stripped stdout/stderr text.
        """
        # Decode before formatting: interpolating bytes with "%s" would
        # produce "b'...'" in the message.  Undecodable bytes are replaced
        # rather than raising.
        error_text = (cmd_error or b"").decode(errors="replace").strip()
        reply_text = (cmd_reply or b"").decode(errors="replace").strip()

        result = {}
        if error_text:
            result["error"] = "Process: %s" % error_text
        if reply_text:
            result["reply"] = reply_text
        if result:
            result["handled"] = True
        return result

    async def _exec_cmd(self, cmd, body, nick, dir_path, is_admin):
        """Execute ``dir_path/cmd`` and return its output as a result dict.

        Args:
            cmd: command name; must consist of word characters only.
            body: message text passed to the executable as its last argument.
            nick: sender nickname passed as the first argument.
            dir_path: directory in which the executable is looked up.
            is_admin: truthy if the sender is an admin; passed to the
                executable as the string "true" or "false".

        Returns:
            ``{"handled": False}`` when the name is invalid or no executable
            regular file exists; a dict with ``handled: True`` and an
            ``error`` entry when the file is unreadable or cannot be started;
            otherwise the dict produced by ``_build_result``.
        """
        admin_flag = "true" if is_admin else "false"
        path = os.path.join(dir_path, self._trim(cmd))

        if not self._cmd_validator_regexp.match(cmd) or \
           not os.path.isfile(path) or not os.access(path, os.X_OK):
            return {"handled": False}

        if not os.access(path, os.R_OK):
            return {"handled": True,
                    "error": "\"%s\" is not readable" % path}

        # An argument list (no shell) is used, so the arguments are never
        # interpreted by a shell.
        argv = [path, self._trim(nick), admin_flag, self._trim(body)]
        try:
            pipe = asyncio.subprocess.PIPE
            proc = await asyncio.create_subprocess_exec(*argv,
                                                        stdout=pipe,
                                                        stderr=pipe)
            cmd_reply, cmd_error = await proc.communicate()
        except OSError as e:
            return {"handled": True,
                    "error": "Execute: %s" % str(e)}

        return self._build_result(cmd_reply, cmd_error)

    async def command(self, command, body, nick, from_id, is_admin):
        """Run *command* as an executable from the ``plugins`` directory."""
        return await self._exec_cmd(command, body, nick,
                                    "plugins", is_admin)

    async def question(self, body, nick, from_id, is_admin):
        """Run the ``answer`` executable from the ``chat`` directory."""
        return await self._exec_cmd("answer", body, nick,
                                    "chat", is_admin)

+ 60
- 0
plugins/title_fetch.py View File

@@ -0,0 +1,60 @@
1
+# -*- python -*-
2
+import asyncio
3
+import bs4
4
+import functools
5
+import lxml
6
+import re
7
+import requests
8
+
9
class Plugin:
    """Reply to chat messages that contain HTTP(S) links with page titles."""

    # Matches http:// and https:// URLs up to the next quote, whitespace
    # or ">" character.
    _html_regexp = re.compile(r"(https?://[^\"\s>]+)")

    # Content types whose body is searched for a <title> element.
    _mime_types = ("application/xhtml+xml", "application/xml",
                   "text/html", "text/xml")

    # Seconds to wait for the server before giving up on a request.
    _timeout = 10

    # Upper bound on the number of decoded characters that are parsed while
    # looking for a title; keeps the work bounded for title-less documents.
    _max_chars = 1 << 20

    def _fetch_title(self, url):
        """Download *url* and return its HTML title, or "" if there is none.

        This is blocking code and is meant to be run in an executor.
        Network and parsing errors propagate to the caller.
        """
        req = requests.get(url, stream=True, timeout=self._timeout)
        try:
            # Media types are case-insensitive; a missing header means
            # "not HTML" rather than an error.
            content_type = req.headers.get("content-type", "").lower()
            if not content_type.startswith(self._mime_types):
                return ""

            # Handle a case when no charset is defined for text/html.
            if content_type.startswith("text/") and \
               "charset=" not in content_type:
                req.encoding = None

            # NOTE(review): apparent_encoding inspects the response content,
            # so this path appears to read the whole body first - confirm
            # against the requests version in use.
            if not req.encoding:
                req.encoding = req.apparent_encoding

            contents = title = ""
            for chunk in req.iter_content(chunk_size=128,
                                          decode_unicode=True):
                # Without a known encoding requests hands back raw bytes.
                if isinstance(chunk, bytes):
                    chunk = chunk.decode("utf-8", errors="replace")
                contents += chunk
                soup = bs4.BeautifulSoup(contents, "lxml")
                if soup and soup.title:
                    # The title is complete once it stops changing between
                    # two consecutive parses.
                    if soup.title.string == title:
                        break
                    title = soup.title.string
                if len(contents) >= self._max_chars:
                    break
            return title or ""
        finally:
            # Always release the connection, also when no title was found.
            req.close()

    async def chat_message(self, body, nick, from_id, is_admin):
        """Look up the titles of all links found in *body*.

        Returns:
            ``{"handled": False}`` when *body* contains no URL.  Otherwise a
            dict with ``handled: True``, ``reply`` (one ``Link: <title>``
            line per page that has a title, possibly empty) and, if a fetch
            failed, ``error`` describing the last failure.
        """
        urls = self._html_regexp.findall(body)
        if not urls:
            return {"handled": False}

        loop = asyncio.get_event_loop()
        result = {"handled": True}
        lines = []
        for url in urls:
            try:
                # The whole download and parse runs in the executor so that
                # blocking network reads never stall the event loop.
                title = await loop.run_in_executor(None,
                                                   self._fetch_title, url)
            except Exception as e:
                # Per-URL boundary: one bad link must not prevent the
                # remaining links from being processed.
                result["error"] = "Title fetch: %s" % str(e)
            else:
                if title:
                    lines.append("Link: %s" % title)

        result["reply"] = "\n".join(lines)
        return result

Loading…
Cancel
Save