# hptoad/plugins/title_fetch.py

# -*- python -*-
import asyncio
import bs4
import functools
import lxml
import re
import requests

class Plugin:
    """Chat plugin that replies with the <title> of each HTTP(S) link."""

    # Matches http(s) URLs up to the first quote, whitespace or '>'.
    _html_regexp = re.compile(r"(https?://[^\"\s>]+)")

    # Content types that may carry an HTML/XML <title> element.
    _mime_types = ("application/xhtml+xml", "application/xml",
                   "text/html", "text/xml")
    # Seconds allowed for connecting and for each socket read.
    _timeout = 10
    # Size of each streamed read, in bytes.
    _chunk_size = 1024
    # Upper bound on how much of a document is downloaded for its title.
    _max_bytes = 256 * 1024
    # Once this marker has been received the title text is complete.
    _title_end = b"</title"

    async def chat_message(self, body, nick, from_id, is_admin):
        """Look up the page title of every URL found in *body*.

        Written as a native coroutine because ``asyncio.coroutine`` no
        longer exists on Python 3.11+; callers that ``yield from`` it inside
        a generator-based coroutine, or schedule it as a task, keep working.

        Returns a dict:
          * ``{"handled": False}`` when *body* contains no URL;
          * otherwise ``{"handled": True, "reply": <text>}`` where the text
            has one ``Link: <title>`` line per URL that yielded a title
            (possibly empty), plus ``"error"`` holding the message of the
            last fetch that failed.

        *nick*, *from_id* and *is_admin* are accepted but not used here.
        """
        urls = self._html_regexp.findall(body)
        if not urls:
            return {"handled": False}

        result = {"handled": True}
        loop = asyncio.get_event_loop()
        lines = []
        for url in urls:
            try:
                # All blocking work (network reads and HTML parsing) runs
                # in the executor so the event loop stays responsive.
                title = await loop.run_in_executor(None, self._fetch_title,
                                                   url)
            except Exception as e:
                # Best effort: one bad link must not prevent the others
                # from being reported; only the last error is kept.
                result["error"] = "Title fetch: %s" % str(e)
            else:
                if title:
                    lines.append("Link: %s" % title)
        result["reply"] = "\n".join(lines)
        return result

    def _fetch_title(self, url):
        """Blocking helper: return the page title of *url*, or "" if none.

        Raises whatever ``requests`` or the parser raise; the caller turns
        that into the ``"error"`` entry of its result.
        """
        # NOTE(review): URLs come straight from chat, so this can be pointed
        # at internal hosts; consider filtering targets if that matters.
        resp = requests.get(url, stream=True, timeout=self._timeout)
        try:
            # The header may be missing, and MIME types are case-insensitive.
            content_type = resp.headers.get("content-type", "").lower()
            if not content_type.startswith(self._mime_types):
                return ""
            # Trust the transport encoding only when the server declared
            # one; otherwise let the parser detect it from the bytes.
            declared = resp.encoding if "charset=" in content_type else None
            head = self._read_head(resp)
        finally:
            # With stream=True the connection stays open until closed.
            resp.close()

        soup = bs4.BeautifulSoup(head, "lxml", from_encoding=declared)
        if soup.title is None:
            return ""
        # .string is None when the title has no single text child.
        text = soup.title.string or ""
        # Collapse newlines/indentation so one title is one chat line.
        return " ".join(text.split())

    def _read_head(self, resp):
        """Read *resp* until the title's closing tag or the size cap."""
        marker = self._title_end
        head = bytearray()
        for chunk in resp.iter_content(chunk_size=self._chunk_size):
            # Re-scan only the new bytes plus enough overlap to catch a
            # marker that straddles two chunks.
            scan_from = max(0, len(head) - len(marker) + 1)
            head.extend(chunk)
            if marker in head[scan_from:].lower():
                break
            if len(head) >= self._max_bytes:
                break
        return bytes(head)