# hptoad/plugins/title_fetch.py

# -*- python -*-
import asyncio
import bs4
import functools
import lxml
import re
import requests

class Plugin:
    """Reply to chat messages with the <title> of every URL they contain."""

    _html_regexp = re.compile(r"(https?://[^\"\s>]+)")

    # Content-Type prefixes that may carry a <title> element.
    _mime_types = ("application/xhtml+xml", "application/xml",
                   "text/html", "text/xml")
    # Seconds requests may wait for the connection and for each read; without
    # it a stalled server would keep an executor thread busy forever.
    _timeout = 10
    # Stop looking for a title after this many decoded characters, so a large
    # document without a <title> is neither downloaded nor re-parsed in full.
    _max_scan_chars = 65536

    def _fetch_title(self, url):
        """Download *url* and return the text of its <title>, or "".

        This method blocks on network I/O and must therefore be run in an
        executor, never directly on the event loop.

        Returns "" when the content type is not HTML/XML, when no title is
        found, or when the title has no single text node.  Exceptions raised
        by requests or the parser propagate to the caller.
        """
        req = requests.get(url, stream=True, timeout=self._timeout)
        try:
            # A missing header counts as "not HTML"; header values are
            # compared case-insensitively.
            content_type = req.headers.get("content-type", "").lower()
            if not content_type.startswith(self._mime_types):
                return ""

            # Handle a case when no charset is defined for text/html.
            if content_type.startswith("text/") and \
               "charset=" not in content_type:
                req.encoding = None

            if not req.encoding:
                # NOTE: requests derives apparent_encoding from the response
                # body, so this path reads the body before iterating below.
                # Fall back to UTF-8 when detection yields nothing; otherwise
                # iter_content() would hand back undecoded bytes.
                req.encoding = req.apparent_encoding or "utf-8"

            contents = ""
            title = ""
            for chunk in req.iter_content(chunk_size=128,
                                          decode_unicode=True):
                contents += chunk
                soup = bs4.BeautifulSoup(contents, "lxml")
                if soup.title:
                    candidate = soup.title.string
                    # The title is complete once one more chunk no longer
                    # changes it.
                    if candidate == title:
                        break
                    title = candidate
                if len(contents) >= self._max_scan_chars:
                    break

            if not title:
                return ""
            # Collapse newlines and indentation so a title always occupies
            # exactly one line of the chat reply.
            return " ".join(str(title).split())
        finally:
            # Always release the streamed connection, not only on early exit.
            req.close()

    @asyncio.coroutine
    def chat_message(self, body, nick, from_id, is_admin):
        """Look up the page title of every http(s) URL found in *body*.

        Args:
            body: Text of the chat message.
            nick, from_id, is_admin: Not used by this plugin.

        Returns:
            A dict.  Without URLs it is ``{"handled": False}``.  Otherwise
            "handled" is True and "reply" holds one "Link: <title>" line per
            URL that yielded a title (possibly the empty string); "error"
            holds the message of the last failed fetch, if any.
        """
        urls = self._html_regexp.findall(body)
        if not urls:
            return {"handled": False}

        loop = asyncio.get_event_loop()
        result = {"handled": True}
        lines = []
        for url in urls:
            try:
                # The whole download-and-parse runs in the default executor
                # so the event loop is never blocked on network reads.
                title = yield from loop.run_in_executor(
                    None, self._fetch_title, url)
            except Exception as e:
                # Plugin boundary: report the failure and keep processing the
                # remaining URLs.
                result["error"] = "Title fetch: %s" % str(e)
            else:
                if title:
                    lines.append("Link: %s" % title)
        result["reply"] = "\n".join(lines)
        return result
html parser from XRevan86 2017-06-18 14:31:35 +00:00			`# -- python --`
			`import asyncio`
			`import bs4`
			`import functools`
			`import lxml`
			`import re`
			`import requests`

			`class Plugin:`
			`_html_regexp = re.compile(r"(https?://[^\"\s>]+)")`

			`@asyncio.coroutine`
			`def chat_message(self, body, nick, from_id, is_admin):`
			`loop = asyncio.get_event_loop()`
			`result = {}`

			`urls = self._html_regexp.findall(body)`
			`if urls:`
			`result["handled"] = True`
			`mime_types = ("application/xhtml+xml", "application/xml",`
			`"text/html", "text/xml")`
			`reply = ""`
			`for url in urls:`
			`try:`
			`req = yield from \`
			`loop.run_in_executor(None,`
			`functools.partial(requests.get,`
			`url,`
			`stream=True))`
			`if req.headers["content-type"].startswith(mime_types):`
			`# Handle a case when no charset is defined for text/html.`
			`if req.headers["content-type"].startswith("text/") and \`
			`not "charset=" in req.headers["content-type"]:`
			`req.encoding = None`

			`if not req.encoding:`
			`req.encoding = req.apparent_encoding`

			`contents = title = ""`
			`for i in req.iter_content(chunk_size=128,`
			`decode_unicode=True):`
			`contents += i`
			`soup = bs4.BeautifulSoup(contents, "lxml")`
			`if soup and soup.title:`
			`if soup.title.string == title:`
			`req.close()`
			`break`
			`title = soup.title.string`

			`if title:`
			`if reply:`
			`reply += "\n"`
			`reply += "Link: %s" % title`
			`except Exception as e:`
			`result["error"] = "Title fetch: %s" % str(e)`
			`result["reply"] = reply`
			`else:`
			`result["handled"] = False`

			`return result`