Browse Source

html parser from XRevan86

master
ivan 2 years ago
parent
commit
aa3c2e49cc
3 changed files with 124 additions and 0 deletions
  1. +2
    -0
      README.md
  2. +62
    -0
      plugins/shell_cmds.py
  3. +60
    -0
      plugins/title_fetch.py

+ 2
- 0
README.md View File

@@ -1,3 +1,5 @@
# hptoad

An MIT-licensed XMPP bot written using Python 3 and slixmpp.

Original project: https://gitlab.com/XRevan86/hptoad

+ 62
- 0
plugins/shell_cmds.py View File

@@ -0,0 +1,62 @@
# -*- python -*-
import asyncio
import os
import re

class Plugin:
    """Run executable files as bot commands and chat answers.

    A command named ``foo`` is served by the executable ``plugins/foo``;
    questions addressed to the bot are served by the executable
    ``chat/answer``.  The executable is invoked as
    ``<path> <nick> <is_admin> <body>`` where ``is_admin`` is the literal
    string ``"true"`` or ``"false"``.
    """

    # Sequences removed from every value handed to the executable:
    # backticks, dollar signs and "..".
    _trim_regexp = re.compile(r"(`|\$|\.\.)")
    # ASCII quotes are replaced with a typographic quote.
    _quote_regexp = re.compile("(\"|')")

    # Letters, digits and underscores only.  For str patterns ``\w`` is
    # Unicode-aware, so Cyrillic letters are accepted as well.  The stdlib
    # ``re`` module has no ``\p{...}`` property classes and refuses to
    # compile a pattern containing one.
    _cmd_validator_regexp = re.compile(r"^\w*$")

    @classmethod
    def _trim(cls, s):
        """Return *s* with risky sequences removed and quotes replaced.

        Backticks, ``$`` and ``..`` are dropped, ``"`` and ``'`` become
        ``“``, and surrounding whitespace is stripped.
        """
        result = cls._trim_regexp.sub("", s)
        result = cls._quote_regexp.sub("“", result).strip()
        return result

    @staticmethod
    def _decode(data):
        """Decode process output to stripped text, never raising."""
        # errors="replace" keeps a script that prints non-UTF-8 bytes from
        # blowing up the handler with UnicodeDecodeError.
        return (data or b"").decode(errors="replace").strip()

    async def _exec_cmd(self, cmd, body, nick, dir_path, is_admin):
        """Execute ``dir_path/cmd`` and collect its output.

        Returns a dict:

        * ``{"handled": False}`` when *cmd* is not a valid name or does not
          refer to an executable regular file;
        * ``{"handled": True, "error": ...}`` when the file is not readable
          or cannot be started;
        * otherwise a dict with ``"reply"`` (stdout) and/or ``"error"``
          (stderr) plus ``"handled": True``.  The dict is empty when the
          process printed nothing at all.
        """
        is_admin = "true" if is_admin else "false"
        path = os.path.join(dir_path, self._trim(cmd))

        if not self._cmd_validator_regexp.match(cmd) or \
                not os.access(path, os.F_OK | os.X_OK) or \
                not os.path.isfile(path):
            return {"handled": False}

        if not os.access(path, os.R_OK):
            return {"handled": True,
                    "error": "\"%s\" is not readable" % path}

        # Arguments are passed as a list (no shell involved).
        argv = [path, self._trim(nick), is_admin, self._trim(body)]
        try:
            pipe = asyncio.subprocess.PIPE
            proc = await asyncio.create_subprocess_exec(*argv,
                                                        stdout=pipe,
                                                        stderr=pipe)
            cmd_reply, cmd_error = await proc.communicate()
        except OSError as e:
            return {"handled": True,
                    "error": "Execute: %s" % str(e)}

        # Both streams arrive as bytes; decode them before formatting so
        # the error text is not rendered as "b'...'".
        error_text = self._decode(cmd_error)
        reply_text = self._decode(cmd_reply)

        result = {}
        if error_text:
            result["error"] = "Process: %s" % error_text
        if reply_text:
            result["reply"] = reply_text
        if result:
            result["handled"] = True
        return result

    async def command(self, command, body, nick, from_id, is_admin):
        """Handle a bot command by running ``plugins/<command>``."""
        return await self._exec_cmd(command, body, nick,
                                    "plugins", is_admin)

    async def question(self, body, nick, from_id, is_admin):
        """Handle a question to the bot by running ``chat/answer``."""
        return await self._exec_cmd("answer", body, nick,
                                    "chat", is_admin)

+ 60
- 0
plugins/title_fetch.py View File

@@ -0,0 +1,60 @@
# -*- python -*-
import asyncio
import bs4
import functools
import lxml
import re
import requests

class Plugin:
    """Reply with the HTML ``<title>`` of every link found in a message."""

    # An http(s) URL runs until the first double quote, whitespace or ">".
    _html_regexp = re.compile(r"(https?://[^\"\s>]+)")

    # Content types that are worth parsing for a <title>.
    _mime_types = ("application/xhtml+xml", "application/xml",
                   "text/html", "text/xml")

    # Seconds to wait for the server before giving up on a request.
    _timeout = 10

    def _fetch_title(self, url):
        """Download *url* and return its title, or ``""`` if there is none.

        This is blocking code and is meant to run in an executor thread.
        The body is streamed in small chunks and re-parsed after each one,
        so the download can stop as soon as the title no longer changes
        between two consecutive chunks.
        """
        req = requests.get(url, stream=True, timeout=self._timeout)
        try:
            # A response without the header is treated as "not HTML"
            # instead of failing with KeyError.
            content_type = req.headers.get("content-type", "")
            if not content_type.startswith(self._mime_types):
                return ""

            # Handle a case when no charset is defined for text/html.
            if content_type.startswith("text/") and \
                    "charset=" not in content_type:
                req.encoding = None

            if not req.encoding:
                req.encoding = req.apparent_encoding

            contents = title = ""
            for chunk in req.iter_content(chunk_size=128,
                                          decode_unicode=True):
                # With no usable encoding the chunks arrive as bytes.
                if isinstance(chunk, bytes):
                    chunk = chunk.decode("utf-8", errors="replace")
                contents += chunk
                soup = bs4.BeautifulSoup(contents, "lxml")
                if soup and soup.title:
                    if soup.title.string == title:
                        break
                    title = soup.title.string
            return title or ""
        finally:
            # stream=True keeps the connection open until it is released.
            req.close()

    async def chat_message(self, body, nick, from_id, is_admin):
        """Look for links in *body* and report their page titles.

        Returns ``{"handled": False}`` when *body* contains no URL.
        Otherwise returns ``{"handled": True, "reply": ...}`` where the
        reply holds one ``"Link: <title>"`` line per page that has a title
        (possibly an empty string), plus ``"error"`` describing the last
        fetch that failed, if any.
        """
        urls = self._html_regexp.findall(body)
        if not urls:
            return {"handled": False}

        loop = asyncio.get_event_loop()
        result = {"handled": True}
        lines = []
        for url in urls:
            try:
                # The whole download and parse runs in the executor so that
                # no blocking network read happens on the event loop.
                title = await loop.run_in_executor(None,
                                                   self._fetch_title, url)
            except Exception as e:
                # One bad link must not prevent the other titles from
                # being reported; the failure is handed back to the caller.
                result["error"] = "Title fetch: %s" % str(e)
            else:
                if title:
                    lines.append("Link: %s" % title)

        result["reply"] = "\n".join(lines)
        return result

Loading…
Cancel
Save