html parser from XRevan86
This commit is contained in:
parent
47aab1356e
commit
aa3c2e49cc
|
@ -1,3 +1,5 @@
|
||||||
# hptoad
|
# hptoad
|
||||||
|
|
||||||
An MIT licensed XMPP bot written using Python 3 and slixmpp.
|
An MIT licensed XMPP bot written using Python 3 and slixmpp.
|
||||||
|
|
||||||
|
Original project: https://gitlab.com/XRevan86/hptoad
|
||||||
|
|
|
@ -0,0 +1,62 @@
|
||||||
|
# -*- python -*-
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
|
class Plugin:
    """Run external executables as bot commands and chat answers.

    ``command`` looks the executable up in the ``plugins`` directory and
    ``question`` always runs ``chat/answer``; both directories are relative
    paths.  The executable is started without a shell and receives three
    arguments: the sanitised nick, the admin flag (``"true"``/``"false"``)
    and the sanitised message body.
    """

    # Backticks, dollar signs and ".." are removed from every argument.
    _trim_regexp = re.compile("(`|\\$|\\.\\.)")
    # Straight single and double quotes are replaced with a typographic one.
    _quote_regexp = re.compile("(\"|')")

    # Letters, digits and underscore only.  In a Python 3 str pattern \w is
    # Unicode-aware, so Cyrillic letters are already covered; the \p{...}
    # escape is not supported by the re module and makes re.compile() raise
    # on Python 3.6+.  \Z (unlike $) also rejects a trailing newline.
    _cmd_validator_regexp = re.compile(r"^\w*\Z")

    @classmethod
    def _trim(cls, s):
        """Return *s* sanitised for use as a command-line argument.

        Backticks, ``$`` and ``..`` are dropped, straight quotes become
        ``“`` and surrounding whitespace is stripped.
        """
        result = cls._trim_regexp.sub("", s)
        return cls._quote_regexp.sub("“", result).strip()

    async def _exec_cmd(self, cmd, body, nick, dir_path, is_admin):
        """Run the executable *cmd* from *dir_path* and collect its output.

        Returns a dict:
          * ``{"handled": False}`` when *cmd* contains characters other than
            letters, digits and underscore, or is not an executable regular
            file inside *dir_path*;
          * ``{"handled": True, "error": ...}`` when the file is not
            readable or the process could not be started;
          * otherwise ``"reply"`` (stdout) and/or ``"error"`` (stderr) plus
            ``"handled": True``; an empty dict if the process printed
            nothing at all.
        """
        if not self._cmd_validator_regexp.match(cmd):
            return {"handled": False}

        path = os.path.join(dir_path, self._trim(cmd))
        if not os.path.isfile(path) or not os.access(path, os.X_OK):
            return {"handled": False}

        if not os.access(path, os.R_OK):
            return {"handled": True,
                    "error": "\"%s\" is not readable" % path}

        argv = [path, self._trim(nick),
                "true" if is_admin else "false", self._trim(body)]
        try:
            pipe = asyncio.subprocess.PIPE
            proc = await asyncio.create_subprocess_exec(*argv,
                                                        stdout=pipe,
                                                        stderr=pipe)
            cmd_reply, cmd_error = await proc.communicate()
        except OSError as e:
            return {"handled": True,
                    "error": "Execute: %s" % str(e)}

        # Both streams arrive as bytes; decode them so the messages do not
        # end up as "b'...'" and undecodable output cannot raise.
        error_text = cmd_error.decode(errors="replace").strip()
        reply_text = cmd_reply.decode(errors="replace").strip()

        result = {}
        if error_text:
            result["error"] = "Process: %s" % error_text
        if reply_text:
            result["reply"] = reply_text
        if result:
            result["handled"] = True
        return result

    async def command(self, command, body, nick, from_id, is_admin):
        """Handle a bot command by running ``plugins/<command>``."""
        return await self._exec_cmd(command, body, nick,
                                    "plugins", is_admin)

    async def question(self, body, nick, from_id, is_admin):
        """Handle a message addressed to the bot by running ``chat/answer``."""
        return await self._exec_cmd("answer", body, nick,
                                    "chat", is_admin)
|
|
@ -0,0 +1,60 @@
|
||||||
|
# -*- python -*-
|
||||||
|
import asyncio
|
||||||
|
import bs4
|
||||||
|
import functools
|
||||||
|
import lxml
|
||||||
|
import re
|
||||||
|
import requests
|
||||||
|
|
||||||
|
class Plugin:
    """Reply with the HTML <title> of every http(s) link in a chat message."""

    _html_regexp = re.compile(r"(https?://[^\"\s>]+)")

    # Content types that are parsed for a <title>.
    _mime_types = ("application/xhtml+xml", "application/xml",
                   "text/html", "text/xml")
    # Seconds to wait for the server, so that a stalled host cannot occupy
    # an executor thread forever.
    _timeout = 30

    @classmethod
    def _fetch_title(cls, url):
        """Download *url* and return its title, or "" when there is none.

        This helper blocks on network I/O and HTML parsing, so it must be
        called from an executor thread and never directly on the event loop.
        """
        req = requests.get(url, stream=True, timeout=cls._timeout)
        try:
            # A response without a Content-Type header is simply skipped.
            content_type = req.headers.get("content-type", "")
            if not content_type.startswith(cls._mime_types):
                return ""

            # Handle a case when no charset is defined for text/html.
            if content_type.startswith("text/") and \
               "charset=" not in content_type:
                req.encoding = None

            if not req.encoding:
                # Detection may fail and yield None; without an encoding
                # iter_content() returns bytes, which cannot be appended to
                # the str buffer below.
                req.encoding = req.apparent_encoding or "utf-8"

            contents = title = ""
            for chunk in req.iter_content(chunk_size=128,
                                          decode_unicode=True):
                contents += chunk
                soup = bs4.BeautifulSoup(contents, "lxml")
                if soup.title:
                    # The title is complete once one more chunk did not
                    # change it; the rest of the page is not needed.
                    if soup.title.string == title:
                        break
                    title = soup.title.string
            return title or ""
        finally:
            # Release the streamed connection on every exit path.
            req.close()

    async def chat_message(self, body, nick, from_id, is_admin):
        """Look for links in *body* and report their page titles.

        Returns ``{"handled": False}`` when *body* contains no link.
        Otherwise returns ``"handled": True`` and ``"reply"`` holding one
        ``Link: <title>`` line per link that has a title (possibly an empty
        string); ``"error"`` carries the message of the last failed fetch.
        """
        urls = self._html_regexp.findall(body)
        if not urls:
            return {"handled": False}

        loop = asyncio.get_event_loop()
        result = {"handled": True}
        lines = []
        for url in urls:
            try:
                title = await loop.run_in_executor(
                    None, functools.partial(self._fetch_title, url))
            except Exception as e:
                # One broken link must not prevent the others from being
                # reported, so the failure is recorded and the loop goes on.
                result["error"] = "Title fetch: %s" % str(e)
            else:
                if title:
                    lines.append("Link: %s" % title)

        result["reply"] = "\n".join(lines)
        return result
|
Loading…
Reference in New Issue