html parser from XRevan86
This commit is contained in:
		
							parent
							
								
									47aab1356e
								
							
						
					
					
						commit
						aa3c2e49cc
					
				|  | @ -1,3 +1,5 @@ | ||||||
| # hptoad | # hptoad | ||||||
| 
 | 
 | ||||||
| An MIT licensed XMPP bot written using Python 3 and slixmpp. | An MIT licensed XMPP bot written using Python 3 and slixmpp. | ||||||
|  | 
 | ||||||
|  | Original project: https://gitlab.com/XRevan86/hptoad | ||||||
|  |  | ||||||
|  | @ -0,0 +1,62 @@ | ||||||
|  | # -*- python -*- | ||||||
|  | import asyncio | ||||||
|  | import os | ||||||
|  | import re | ||||||
|  | 
 | ||||||
class Plugin:
    """Chat plugin that answers commands by running external executables.

    ``command`` looks the executable up in the ``plugins`` directory and
    ``question`` always runs the ``answer`` executable from the ``chat``
    directory.  Both paths are relative to the current working directory.
    The executable receives three arguments: the sender's nick, the string
    ``"true"`` or ``"false"`` (admin flag) and the message body.
    """

    _trim_regexp = re.compile("(`|\\$|\\.\\.)")
    _quote_regexp = re.compile("(\"|')")

    @classmethod
    def _trim(cls, s):
        """Return *s* without backticks, ``$`` and ``..``, with quotes replaced.

        ASCII single and double quotes become a typographic quote and the
        result is stripped of surrounding whitespace.
        """
        result = cls._trim_regexp.sub("", s)
        result = cls._quote_regexp.sub("“", result).strip()
        return result

    # Letters, digits and underscore only.  For str patterns ``\w`` is
    # Unicode-aware, so Cyrillic letters are accepted as well; the stdlib
    # ``re`` module has no ``\p{...}`` classes and refuses to compile them.
    # ``\Z`` (unlike ``$``) does not tolerate a trailing newline.
    _cmd_validator_regexp = re.compile(r"^\w*\Z")

    async def _exec_cmd(self, cmd, body, nick, dir_path, is_admin):
        """Run the executable *cmd* from *dir_path* and collect its output.

        Args:
            cmd: Command name; must consist of word characters only.
            body: Message text, passed (trimmed) as the third argument.
            nick: Sender's nick, passed (trimmed) as the first argument.
            dir_path: Directory the executable is looked up in.
            is_admin: Truthy if the sender is an admin; passed to the
                executable as the string ``"true"`` or ``"false"``.

        Returns:
            A dict.  ``{"handled": False}`` when *cmd* is not a valid name or
            does not name an executable regular file.  Otherwise it may carry
            ``"reply"`` (stdout), ``"error"`` (stderr or a launch failure) and
            ``"handled": True``.  When the process prints nothing at all the
            dict is empty.
        """
        # Validate before touching the filesystem so that separators or
        # dots can never take part in the path.
        if not self._cmd_validator_regexp.match(cmd):
            return {"handled": False}

        path = os.path.join(dir_path, self._trim(cmd))
        if not os.access(path, os.F_OK | os.X_OK) or not os.path.isfile(path):
            return {"handled": False}

        if not os.access(path, os.R_OK):
            return {"handled": True,
                    "error": "\"%s\" is not readable" % path}

        argv = [path, self._trim(nick),
                "true" if is_admin else "false",
                self._trim(body)]
        try:
            pipe = asyncio.subprocess.PIPE
            proc = await asyncio.create_subprocess_exec(*argv,
                                                        stdout=pipe,
                                                        stderr=pipe)
            cmd_reply, cmd_error = await proc.communicate()
        except OSError as e:
            return {"handled": True,
                    "error": "Execute: %s" % str(e)}

        # Both streams arrive as bytes.  Decode them leniently so that a
        # script printing non-UTF-8 data cannot raise from here, and so the
        # error text is not rendered as a bytes repr.
        error_text = (cmd_error or b"").decode(errors="replace").strip()
        reply_text = (cmd_reply or b"").decode(errors="replace").strip()

        result = {}
        if error_text:
            result["error"] = "Process: %s" % error_text
        if reply_text:
            result["reply"] = reply_text
        if result:
            result["handled"] = True
        return result

    async def command(self, command, body, nick, from_id, is_admin):
        """Handle a bot command by running ``plugins/<command>``.

        *from_id* is accepted for interface compatibility and not used.
        Returns the dict produced by ``_exec_cmd``.
        """
        return await self._exec_cmd(command, body, nick,
                                    "plugins", is_admin)

    async def question(self, body, nick, from_id, is_admin):
        """Handle a message addressed to the bot by running ``chat/answer``.

        *from_id* is accepted for interface compatibility and not used.
        Returns the dict produced by ``_exec_cmd``.
        """
        return await self._exec_cmd("answer", body, nick,
                                    "chat", is_admin)
|  | @ -0,0 +1,60 @@ | ||||||
|  | # -*- python -*- | ||||||
|  | import asyncio | ||||||
|  | import bs4 | ||||||
|  | import functools | ||||||
|  | import lxml | ||||||
|  | import re | ||||||
|  | import requests | ||||||
|  | 
 | ||||||
class Plugin:
    """Chat plugin that replies with the page title of every link in a message."""

    _html_regexp = re.compile(r"(https?://[^\"\s>]+)")

    # Only responses whose Content-Type starts with one of these are parsed.
    _mime_types = ("application/xhtml+xml", "application/xml",
                   "text/html", "text/xml")
    # Seconds requests waits for the connection and for each read.
    _timeout = 10
    # Stop downloading after this many bytes even if no title was seen.
    _max_bytes = 256 * 1024
    _chunk_size = 4096

    @classmethod
    def _fetch_title(cls, url):
        """Download the start of *url* and return its title, or ``""``.

        This function blocks on network I/O and is meant to be run in an
        executor, not on the event loop thread.  Exceptions raised by
        requests or the parser propagate to the caller.
        """
        req = requests.get(url, stream=True, timeout=cls._timeout)
        try:
            content_type = req.headers.get("content-type", "").lower()
            if not content_type.startswith(cls._mime_types):
                return ""

            # Trust the encoding only when the server declared a charset;
            # otherwise let BeautifulSoup detect it from the document.
            declared = req.encoding if "charset=" in content_type else None

            data = bytearray()
            for chunk in req.iter_content(chunk_size=cls._chunk_size):
                data += chunk
                # Look at the new chunk plus a small overlap so a closing
                # tag split across two chunks is still noticed.
                tail = bytes(data[-(len(chunk) + 7):]).lower()
                if b"</title" in tail or len(data) >= cls._max_bytes:
                    break

            soup = bs4.BeautifulSoup(bytes(data), "lxml",
                                     from_encoding=declared)
            if soup.title and soup.title.string:
                # Collapse newlines and runs of spaces into single spaces.
                return " ".join(soup.title.string.split())
            return ""
        finally:
            # With stream=True the connection stays open until closed.
            req.close()

    async def chat_message(self, body, nick, from_id, is_admin):
        """Look for http(s) links in *body* and report their titles.

        *nick*, *from_id* and *is_admin* are accepted for interface
        compatibility and not used.

        Returns:
            ``{"handled": False}`` when *body* contains no link.  Otherwise a
            dict with ``"handled": True``, ``"reply"`` holding one
            ``Link: <title>`` line per link that yielded a title (possibly an
            empty string) and, if a fetch failed, ``"error"`` describing the
            most recent failure.
        """
        urls = self._html_regexp.findall(body)
        if not urls:
            return {"handled": False}

        loop = asyncio.get_event_loop()
        result = {"handled": True}
        lines = []
        for url in urls:
            try:
                title = await loop.run_in_executor(
                    None, functools.partial(self._fetch_title, url))
            except Exception as e:
                # Best effort: one failing link must not stop the others.
                result["error"] = "Title fetch: %s" % str(e)
            else:
                if title:
                    lines.append("Link: %s" % title)

        result["reply"] = "\n".join(lines)
        return result
		Loading…
	
		Reference in New Issue