hptoad/plugins/title_fetch.py

60 lines
2.3 KiB
Python

# -*- python -*-
import asyncio
import bs4
import functools
import lxml
import re
import requests
class Plugin:
    """Chat plugin that replies with the HTML title of each URL in a message.

    Every ``http://`` / ``https://`` URL found in a chat message is fetched
    and, when the response looks like (X)HTML/XML, the text of its
    ``<title>`` element is reported back as a ``Link: ...`` line.
    """

    # http(s) URL, terminated by a double quote, whitespace or '>'.
    _html_regexp = re.compile(r"(https?://[^\"\s>]+)")

    # Content types that may carry a <title> element.
    _mime_types = ("application/xhtml+xml", "application/xml",
                   "text/html", "text/xml")

    # Seconds to wait for the connection and for each socket read; without
    # a timeout an unresponsive host would block a worker thread forever.
    _timeout = 10

    # Size of each streamed piece handed to the parser.
    _chunk_size = 128

    # Upper bound on the decoded text accumulated while hunting for a title,
    # so a huge or endless page cannot make us parse/buffer without limit.
    _max_length = 512 * 1024

    # Lower-cased start of the tag we are looking for.
    _title_marker = "<title"

    # NOTE: generator-based coroutines (``asyncio.coroutine``) were removed
    # in Python 3.11; the decorator is kept so the plugin's calling
    # convention stays exactly as it was.
    @asyncio.coroutine
    def chat_message(self, body, nick, from_id, is_admin):
        """Look up the titles of all URLs mentioned in a chat message.

        Args:
            body: Text of the chat message.
            nick: Unused by this plugin.
            from_id: Unused by this plugin.
            is_admin: Unused by this plugin.

        Returns:
            A dict. ``"handled"`` is ``True`` when the message contained at
            least one URL, ``False`` otherwise. When handled, ``"reply"``
            holds one ``Link: <title>`` line per URL whose title was found
            (an empty string if none), and ``"error"`` is present with the
            message of the last failed fetch, if any fetch failed.
        """
        result = {}
        urls = self._html_regexp.findall(body)
        if not urls:
            result["handled"] = False
            return result

        result["handled"] = True
        loop = asyncio.get_event_loop()
        lines = []
        for url in urls:
            try:
                # The complete fetch (request, streaming, parsing) is
                # blocking, so all of it runs in the default executor
                # rather than on the event loop thread.
                title = yield from loop.run_in_executor(
                    None, self._fetch_title, url)
            except Exception as e:
                # Best effort: report the failure, keep going with the
                # remaining URLs.
                result["error"] = "Title fetch: %s" % str(e)
                continue
            if title:
                lines.append("Link: %s" % title)
        result["reply"] = "\n".join(lines)
        return result

    def _fetch_title(self, url):
        """Blocking helper: download *url* and return its title or None.

        The body is streamed and parsing stops as soon as the title is
        complete, so usually only the head of the page is downloaded.
        Exceptions raised while fetching or parsing propagate to the caller.
        """
        req = requests.get(url, stream=True, timeout=self._timeout)
        try:
            # A missing header is treated like a non-HTML response.
            content_type = req.headers.get("content-type", "").lower()
            if not content_type.startswith(self._mime_types):
                return None

            # For text/* without an explicit charset, requests assumes
            # ISO-8859-1; drop that guess so the encoding is detected from
            # the content instead.
            if content_type.startswith("text/") and \
                    "charset=" not in content_type:
                req.encoding = None
            if not req.encoding:
                # NOTE: apparent_encoding reads the whole body to run the
                # detection. With no usable guess fall back to UTF-8,
                # otherwise iter_content() would yield undecoded bytes.
                req.encoding = req.apparent_encoding or "utf-8"

            contents = title = ""
            tag_seen = False
            # How much of the previous text must be re-checked so that a
            # marker split across two chunks is still found.
            overlap = len(self._title_marker) - 1
            for chunk in req.iter_content(chunk_size=self._chunk_size,
                                          decode_unicode=True):
                contents += chunk
                if not tag_seen:
                    window = contents[-(len(chunk) + overlap):]
                    tag_seen = self._title_marker in window.lower()
                if tag_seen:
                    # Only parse once a title tag can exist; re-parsing
                    # the growing document on every chunk is expensive.
                    soup = bs4.BeautifulSoup(contents, "lxml")
                    if soup.title is not None:
                        # Two consecutive parses yielding the same title
                        # mean the element has been received completely.
                        if soup.title.string == title:
                            break
                        title = soup.title.string
                if len(contents) >= self._max_length:
                    break
        finally:
            # Always release the connection, including on early exit.
            req.close()

        if not title:
            return None
        # Collapse newlines/indentation inside the title so that one URL
        # always produces exactly one reply line.
        return " ".join(title.split()) or None