html parser from XRevan86
This commit is contained in:
parent
47aab1356e
commit
aa3c2e49cc
|
@ -1,3 +1,5 @@
|
|||
# hptoad
|
||||
|
||||
An MIT-licensed XMPP bot written using Python 3 and slixmpp.
|
||||
|
||||
Original project: https://gitlab.com/XRevan86/hptoad
|
||||
|
|
|
@ -0,0 +1,62 @@
|
|||
# -*- python -*-
|
||||
import asyncio
|
||||
import os
|
||||
import re
|
||||
|
||||
class Plugin:
    """Bot plugin that answers commands and questions by running executables.

    A command named ``foo`` is served by the executable ``plugins/foo``;
    questions are served by the executable ``chat/answer``.  Each executable
    is invoked as ``<path> <nick> <"true"|"false"> <body>`` and its stdout
    and stderr become the ``"reply"`` and ``"error"`` entries of the result.
    """

    # Sequences stripped from user-supplied text before it is used as a file
    # name or passed as an argument: backticks, dollar signs and "..".
    _trim_regexp = re.compile(r"(`|\$|\.\.)")
    # ASCII single and double quotes; replaced with a typographic quote.
    _quote_regexp = re.compile("(\"|')")

    # Letters, digits and underscore only.  For str patterns ``\w`` is
    # Unicode-aware, so Cyrillic letters are already covered; the stdlib
    # ``re`` module has no ``\p{...}`` classes and rejects them on compile.
    _cmd_validator_regexp = re.compile(r"^\w*$")

    @classmethod
    def _trim(cls, s):
        """Return *s* with risky sequences removed and quotes neutralised.

        Backticks, ``$`` and ``..`` are deleted, ``"`` and ``'`` are replaced
        with ``“``, and surrounding whitespace is stripped.
        """
        result = cls._trim_regexp.sub("", s)
        result = cls._quote_regexp.sub("“", result).strip()
        return result

    async def _exec_cmd(self, cmd, body, nick, dir_path, is_admin):
        """Run the executable ``dir_path/cmd`` and collect its output.

        Args:
            cmd: Command name; must consist of word characters only.
            body: Message text, passed (trimmed) as the third argument.
            nick: Sender nickname, passed (trimmed) as the first argument.
            dir_path: Directory that holds the executables.
            is_admin: Truthy if the sender is an admin; passed to the
                executable as the string ``"true"`` or ``"false"``.

        Returns:
            ``{"handled": False}`` when *cmd* is invalid or no such
            executable file exists.  Otherwise a dict with ``"handled": True``
            plus ``"reply"`` (stripped stdout) and/or ``"error"`` (a
            description of the failure or the stripped stderr).  When the
            executable ran but printed nothing, an empty dict is returned.
        """
        admin_flag = "true" if is_admin else "false"
        path = os.path.join(dir_path, self._trim(cmd))

        if not self._cmd_validator_regexp.match(cmd) or \
                not os.path.isfile(path) or not os.access(path, os.X_OK):
            return {"handled": False}

        if not os.access(path, os.R_OK):
            return {"handled": True,
                    "error": "\"%s\" is not readable" % path}

        # The argument list goes straight to exec (no shell is involved).
        argv = [path, self._trim(nick), admin_flag, self._trim(body)]
        pipe = asyncio.subprocess.PIPE
        try:
            proc = await asyncio.create_subprocess_exec(*argv,
                                                        stdout=pipe,
                                                        stderr=pipe)
            cmd_reply, cmd_error = await proc.communicate()
        except OSError as e:
            return {"handled": True,
                    "error": "Execute: %s" % str(e)}

        # communicate() returns bytes; decode before formatting so that the
        # message does not contain a b'...' repr.  Undecodable bytes are
        # replaced instead of raising, since plugin output is not trusted to
        # be valid UTF-8.
        error_text = cmd_error.decode(errors="replace").strip() \
            if cmd_error else ""
        reply_text = cmd_reply.decode(errors="replace").strip() \
            if cmd_reply else ""

        result = {}
        if error_text:
            result["error"] = "Process: %s" % error_text
        if reply_text:
            result["reply"] = reply_text
        if result:
            result["handled"] = True
        return result

    async def command(self, command, body, nick, from_id, is_admin):
        """Handle a bot command by running ``plugins/<command>``.

        *from_id* is accepted but not used.  Returns the result dict of
        ``_exec_cmd``.
        """
        return await self._exec_cmd(command, body, nick, "plugins", is_admin)

    async def question(self, body, nick, from_id, is_admin):
        """Handle a question by running ``chat/answer``.

        *from_id* is accepted but not used.  Returns the result dict of
        ``_exec_cmd``.
        """
        return await self._exec_cmd("answer", body, nick, "chat", is_admin)
|
|
@ -0,0 +1,60 @@
|
|||
# -*- python -*-
|
||||
import asyncio
|
||||
import bs4
|
||||
import functools
|
||||
import lxml
|
||||
import re
|
||||
import requests
|
||||
|
||||
class Plugin:
    """Bot plugin that replies with the HTML titles of links in a message."""

    # An http(s) URL: runs until whitespace, a double quote or ">".
    _html_regexp = re.compile(r"(https?://[^\"\s>]+)")

    # Content types (matched as prefixes) that may carry a <title>.
    _mime_types = ("application/xhtml+xml", "application/xml",
                   "text/html", "text/xml")

    # Seconds to wait for the server; without a timeout requests can block
    # forever on a stalled connection.
    _timeout = 10

    @classmethod
    def _fetch_title(cls, url):
        """Download *url* and return its ``<title>`` text, or ``""``.

        This is blocking code (network I/O and HTML parsing) and is meant to
        be run in an executor.  The body is read in small chunks and reading
        stops as soon as the parsed title is the same after two consecutive
        chunks.  The response is always closed.  Exceptions raised by
        ``requests`` or the parser propagate to the caller.
        """
        req = requests.get(url, stream=True, timeout=cls._timeout)
        try:
            content_type = req.headers.get("content-type", "").lower()
            if not content_type.startswith(cls._mime_types):
                return ""

            # Handle a case when no charset is defined for text/html.
            if content_type.startswith("text/") and \
                    "charset=" not in content_type:
                req.encoding = None

            if not req.encoding:
                # Fall back to UTF-8 when detection yields nothing (e.g. an
                # empty body); with no encoding set iter_content() would
                # yield bytes and the str concatenation below would fail.
                req.encoding = req.apparent_encoding or "utf-8"

            contents = title = ""
            for chunk in req.iter_content(chunk_size=128,
                                          decode_unicode=True):
                contents += chunk
                soup = bs4.BeautifulSoup(contents, "lxml")
                if soup and soup.title:
                    # An unchanged title after one more chunk means the
                    # element is complete; no need to read any further.
                    if soup.title.string == title:
                        break
                    title = soup.title.string
            return title or ""
        finally:
            req.close()

    async def chat_message(self, body, nick, from_id, is_admin):
        """Look for URLs in *body* and reply with the title of each page.

        *nick*, *from_id* and *is_admin* are accepted but not used.

        Returns:
            ``{"handled": False}`` when *body* contains no URL.  Otherwise a
            dict with ``"handled": True`` and ``"reply"`` holding one
            ``"Link: <title>"`` line per page whose title was found (possibly
            an empty string), plus ``"error"`` describing the most recent
            fetch failure, if any.
        """
        urls = self._html_regexp.findall(body)
        if not urls:
            return {"handled": False}

        loop = asyncio.get_event_loop()
        result = {"handled": True}
        lines = []
        for url in urls:
            try:
                # The whole fetch-and-parse step runs in the default
                # executor so that it does not block the event loop.
                title = await loop.run_in_executor(None, self._fetch_title,
                                                   url)
            except Exception as e:
                # Deliberately broad: fetching titles is best effort and a
                # bad link must not prevent the remaining ones from being
                # processed.
                result["error"] = "Title fetch: %s" % str(e)
                continue
            if title:
                lines.append("Link: %s" % title)

        result["reply"] = "\n".join(lines)
        return result
|
Loading…
Reference in New Issue