html parser from XRevan86

这个提交包含在:
ivan 2017-06-18 17:31:35 +03:00
父节点 47aab1356e
当前提交 aa3c2e49cc
共有 3 个文件被更改,包括 124 次插入0 次删除

查看文件

@ -1,3 +1,5 @@
# hptoad
An MIT licensed XMPP bot written using Python 3 and slixmpp.
Original project: https://gitlab.com/XRevan86/hptoad

62
plugins/shell_cmds.py 普通文件
查看文件

@ -0,0 +1,62 @@
# -*- python -*-
import asyncio
import os
import re
class Plugin:
    """Expose executable files in a directory as bot commands.

    A command name is resolved to a file inside a directory ("plugins" for
    explicit commands, "chat" for the "answer" script). The file is executed
    with three arguments: the sanitized nick, the literal string "true" or
    "false" for the admin flag, and the sanitized message body. Whatever the
    process writes to stdout becomes the reply; stderr becomes the error.
    """

    # Backticks, dollar signs and ".." are removed from every value before it
    # is used in a path or passed to a script.
    _trim_regexp = re.compile("(`|\\$|\\.\\.)")
    # Single and double quotes are removed as well.
    _quote_regexp = re.compile("(\"|')")

    # ASCII letters, digits, underscore and Cyrillic letters only.
    # Python's `re` has no `\p{Cyrillic}` class (compiling it raises
    # re.error), so the Cyrillic and Cyrillic Supplement blocks are spelled
    # out as a code point range. `\Z` (rather than `$`) also rejects a
    # trailing newline.
    _cmd_validator_regexp = re.compile(r"^[A-Za-z0-9_\u0400-\u052F]*\Z")

    @classmethod
    def _trim(cls, s):
        """Return *s* without shell-sensitive characters, quotes and outer
        whitespace."""
        result = cls._trim_regexp.sub("", s)
        result = cls._quote_regexp.sub("", result).strip()
        return result

    @staticmethod
    def _decode_output(raw):
        """Decode process output bytes to stripped text.

        Undecodable bytes are replaced instead of raising, so a script that
        prints binary garbage cannot crash the handler. Returns "" for
        None or empty output.
        """
        if not raw:
            return ""
        return raw.decode(errors="replace").strip()

    async def _exec_cmd(self, cmd, body, nick, dir_path, is_admin):
        """Run the executable named *cmd* from *dir_path*.

        Args:
            cmd: command name; must match ``_cmd_validator_regexp``.
            body: message text, passed (sanitized) as the third argument.
            nick: sender nick, passed (sanitized) as the first argument.
            dir_path: directory the executable is looked up in.
            is_admin: truthy if the sender is an admin; passed to the
                script as the string "true" or "false".

        Returns:
            ``{"handled": False}`` when the name is invalid or no such
            executable regular file exists. Otherwise a dict that may carry
            "reply" (stdout) and/or "error" (stderr or a launch failure);
            "handled" is set to True whenever either is present. A process
            that prints nothing yields an empty dict.
        """
        if not self._cmd_validator_regexp.match(cmd):
            return {"handled": False}

        path = os.path.join(dir_path, self._trim(cmd))
        if not os.access(path, os.F_OK | os.X_OK) or \
                not os.path.isfile(path):
            return {"handled": False}

        if not os.access(path, os.R_OK):
            return {"handled": True,
                    "error": "\"%s\" is not readable" % path}

        admin_flag = "true" if is_admin else "false"
        # Arguments are passed as a list (no shell involved).
        argv = [path, self._trim(nick), admin_flag, self._trim(body)]

        try:
            pipe = asyncio.subprocess.PIPE
            proc = await asyncio.create_subprocess_exec(*argv,
                                                        stdout=pipe,
                                                        stderr=pipe)
            cmd_reply, cmd_error = await proc.communicate()
        except OSError as e:
            return {"handled": True,
                    "error": "Execute: %s" % str(e)}

        result = {}
        # Both streams arrive as bytes; decode them before formatting so the
        # error text is not rendered as a b'...' literal.
        error_text = self._decode_output(cmd_error)
        if error_text:
            result["error"] = "Process: %s" % error_text
        reply_text = self._decode_output(cmd_reply)
        if reply_text:
            result["reply"] = reply_text

        if result:
            result["handled"] = True
        return result

    async def command(self, command, body, nick, from_id, is_admin):
        """Handle an explicit command by running its script from "plugins"."""
        result = await self._exec_cmd(command, body, nick,
                                      "plugins", is_admin)
        return result

    async def question(self, body, nick, from_id, is_admin):
        """Handle a question by running the "answer" script from "chat"."""
        result = await self._exec_cmd("answer", body, nick,
                                      "chat", is_admin)
        return result

60
plugins/title_fetch.py 普通文件
查看文件

@ -0,0 +1,60 @@
# -*- python -*-
import asyncio
import bs4
import functools
import lxml
import re
import requests
class Plugin:
    """Reply with the HTML <title> of each http(s) link found in a message."""

    _html_regexp = re.compile(r"(https?://[^\"\s>]+)")

    # Only responses whose Content-Type starts with one of these are parsed.
    _mime_types = ("application/xhtml+xml", "application/xml",
                   "text/html", "text/xml")

    # Seconds to wait for the server before giving up on a URL; without a
    # timeout a stalled server would hold a worker thread indefinitely.
    _request_timeout = 10

    # Stop scanning for a title after this many decoded characters so a
    # title-less or endless document cannot grow memory without bound.
    _max_scan_chars = 1024 * 1024

    @staticmethod
    def _normalize_title(raw):
        """Collapse all whitespace runs in *raw* to single spaces.

        Returns "" when *raw* is None or empty.
        """
        if not raw:
            return ""
        return " ".join(raw.split())

    def _fetch_title(self, url):
        """Download *url* and return its title text, or "" if there is none.

        This method blocks on network I/O and on parsing; it is meant to be
        run in an executor thread, not on the event loop. Exceptions from
        the HTTP request or the parser propagate to the caller.
        """
        req = requests.get(url, stream=True, timeout=self._request_timeout)
        try:
            # The header may be absent; treat that as "not HTML".
            content_type = req.headers.get("content-type", "").lower()
            if not content_type.startswith(self._mime_types):
                return ""

            # Handle a case when no charset is defined for text/html.
            if content_type.startswith("text/") and \
                    "charset=" not in content_type:
                req.encoding = None
            if not req.encoding:
                # NOTE: apparent_encoding inspects the response body.
                # Fall back to UTF-8 if detection yields nothing, because
                # with no encoding iter_content would yield bytes.
                req.encoding = req.apparent_encoding or "utf-8"

            contents = ""
            title = ""
            for chunk in req.iter_content(chunk_size=128,
                                          decode_unicode=True):
                contents += chunk
                soup = bs4.BeautifulSoup(contents, "lxml")
                if soup.title is not None:
                    # The title did not change after one more chunk, so it
                    # has been read completely; stop downloading.
                    if soup.title.string == title:
                        break
                    title = soup.title.string
                if len(contents) >= self._max_scan_chars:
                    break
            return self._normalize_title(title)
        finally:
            # stream=True keeps the connection open until it is closed
            # explicitly, so release it on every path.
            req.close()

    async def chat_message(self, body, nick, from_id, is_admin):
        """Look up the titles of all links in *body*.

        Returns:
            ``{"handled": False}`` when *body* contains no http(s) URL.
            Otherwise a dict with "handled" set to True and "reply" holding
            one "Link: <title>" line per URL that produced a title (an empty
            string if none did). If fetching a URL fails, "error" carries
            the message of the most recent failure and the remaining URLs
            are still processed.
        """
        urls = self._html_regexp.findall(body)
        if not urls:
            return {"handled": False}

        loop = asyncio.get_event_loop()
        result = {"handled": True}
        lines = []
        for url in urls:
            try:
                # The whole fetch (request, body read, parsing) is blocking,
                # so all of it runs in the default executor.
                title = await loop.run_in_executor(None,
                                                   self._fetch_title, url)
            except Exception as e:
                # Boundary handler: one bad link must not abort the others.
                result["error"] = "Title fetch: %s" % str(e)
                continue
            if title:
                lines.append("Link: %s" % title)
        result["reply"] = "\n".join(lines)
        return result