X-Git-Url: http://git.kaliko.me/?a=blobdiff_plain;f=sid%2Ffeeds.py;h=5a33d6d5ccb40dbaa2c1f88d3c5a1795d02e16aa;hb=HEAD;hp=dd3c78116361e50ac0b217d12644b6b68ca32250;hpb=6111ca5fcf8cb38596130688f7883f3e87f4c362;p=sid.git diff --git a/sid/feeds.py b/sid/feeds.py index dd3c781..3fe7ce9 100644 --- a/sid/feeds.py +++ b/sid/feeds.py @@ -1,24 +1,30 @@ # -*- coding: utf-8 -*- +# SPDX-FileCopyrightText: 2011, 2014, 2020 kaliko +# SPDX-License-Identifier: GPL-3.0-or-later +"""Publish news from various Debian feeds (security, planet, package tracker, see :py:obj:`sid.feeds.Feeds.FEEDS` for defaults). -# Copyright (C) 2011, 2014 kaliko +Can easily be used for other feeds (rss, atom). -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, version 3 only. +.. note:: + Feeds plugin depends on external module: **feedparser** -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . +>>> from sid.feeds import Feeds +>>> # Time between check in seconds +>>> Feeds.TEMPO = 60 +>>> # Fedds to monitor, cf. sid.feeds.Feeds.FEEDS for defaults +>>> Feeds.FEEDS = [ + 'https://example.org/feeds/atom/news.atom.xml' + ] +""" import datetime import threading import time import traceback +from urllib.error import URLError +from urllib.parse import urlparse + from feedparser import parse as feed_parse from .plugin import Plugin, botcmd @@ -52,61 +58,69 @@ class FeedMonitor(threading.Thread): self.seen = dict() self.thread_killed = False + def _update_cache(self, feed, parsed): + self.seen[feed].update({'ids': {p.id for p in parsed.entries} or {}}) + # Common HTTP caching + if parsed.get('etag', False): + self.seen[feed].update({'cache': {'etag': parsed.etag}}) + if parsed.get('modified', False): + self.seen[feed].update({'cache': {'modified': parsed.modified}}) + def new_posts(self, feed): """Send new posts in feed""" - parsed_feed = feed_parse(feed) - + self.plugin.log.debug('feed: : "%s"', feed) + if self.seen.get(feed) and self.seen.get(feed).get('cache'): + parsed_feed = feed_parse(feed, **self.seen[feed]['cache']) + else: + if self.seen.get(feed): + self.plugin.log.debug('No cache headers set (etag/modified)') + parsed_feed = feed_parse(feed) # Cannot resolve address if 'status' not in parsed_feed: - self.plugin.log.error('Error from "%s": %s.' % - (feed, parsed_feed.bozo_exception.__repr__())) + self.plugin.log.error('Error from "%s": %s.', + feed, parsed_feed.bozo_exception.__repr__()) + return + # http caching + if parsed_feed.status == 304: + self.plugin.log.debug('Got 304 not modified') return - # unusual return http code if parsed_feed.status != 200: self.plugin.log.warning( - 'Got code %(status)d from "%(href)s" (please update).' % - parsed_feed) + 'Got code %(status)d from "%(href)s" (please update).', + parsed_feed) return - - feed_updated = parsed_feed.feed.get('updated_parsed', None) - - # Avoid looping over all posts if possible - if feed_updated and strtm_to_dtm(feed_updated) < self.last_check: - self.plugin.log.debug('updated : %s' % strtm_to_dtm(feed_updated)) - self.plugin.log.debug('last check: %s' % self.last_check) + if not self.seen.setdefault(feed): + # Fills with post id when first started (prevent from posting all + # entries at startup) + self.seen[feed] = {'cache': None} + self._update_cache(feed, parsed_feed) return - title = '"%s":' % parsed_feed.feed.get('title', 'n/a') xtitle = '%s:' % html_escape( parsed_feed.feed.get('title', 'n/a')) text = [title] xhtml = [xtitle] - feed_id = parsed_feed.feed.get('id', feed) - if not self.seen.setdefault(feed_id): - # Fills with post id when first started (prevent from posting all - # entries at startup) - self.seen[feed_id] = {p.id for p in parsed_feed.entries} - return # Detecting new post entries = {p.id for p in parsed_feed.entries} + seen_ids = self.seen.get(feed).get('ids') new_entries = [p for p in parsed_feed.entries - if p.id in entries - self.seen.get(feed_id)] + if p.id in entries - seen_ids] for post in new_entries: self.plugin.log.info(post.title) - body = '%(title)s %(link)s' % post text.append(body) - xpost = {'title': html_escape(post.get('title', 'n/a'))} xpost['link'] = html_escape(post.get('link',)) xbody = '{title}'.format(**xpost) xhtml.append(xbody) - # Updating self.seen - self.seen[feed_id] = entries + # Updating self.seen, entries and cache headers + self._update_cache(feed, parsed_feed) if len(text) > 1: - self.plugin.send({'mhtml':'
'.join(xhtml), 'mbody':'\n'.join(text)}) + self.plugin.send(self.plugin.bot.room, + {'mhtml': '
'.join(xhtml), 'mbody': '\n'.join(text)}, + mtype='groupchat') def run(self): while not self.thread_killed: @@ -114,8 +128,12 @@ class FeedMonitor(threading.Thread): for feed in self.feeds_list: try: self.new_posts(feed) - except Exception as err: - self.plugin.log.error('feeds thread crashed: %s' % err) + except ConnectionError as err: # Non fatal exception + self.plugin.log.error('connection error on %s: %s', feed, err) + except URLError as err: # Non fatal exception + self.plugin.log.error('error for "%s": %s', feed, err.reason) + except Exception as err: # Unknown execption, killing thread anyway + self.plugin.log.error('feeds thread crashed: %s', err) self.plugin.log.error(''.join(traceback.format_exc())) self.thread_killed = True self.last_check = datetime.datetime.utcnow() @@ -126,22 +144,22 @@ class FeedMonitor(threading.Thread): class Feeds(Plugin): + """ + .. note:: + Feeds plugin depends on external module: **feedparser** + """ + + #: Time between feeds check TEMPO = 60 + #: Default feeds to monitor FEEDS = [ - # not working - # 'http://www.debian.org/security/dsa', - - # not working - # 'http://www.debian.org/News/news', - - # DPN in french - 'http://www.debian.org/News/weekly/dwn.fr.rdf', - + 'https://www.debian.org/security/dsa', + 'https://www.debian.org/News/news', + # Some packages + 'https://tracker.debian.org/pkg/prosody/rss', + 'https://tracker.debian.org/pkg/ejabberd/rss', # Misc - 'http://rss.gmane.org/topics/excerpts/gmane.linux.debian.devel.announce', - 'http://rss.gmane.org/gmane.linux.debian.user.security.announce', - 'http://planet-fr.debian.net/users/rss20.xml', - 'http://planet.debian.org/atom.xml', + 'https://planet.debian.org/atom.xml', ] def __init__(self, bot): @@ -154,18 +172,19 @@ class Feeds(Plugin): self.th_mon.thread_killed = True @botcmd - def feeds(self, message, args): - """feeds monitors debian project related feeds. - !feeds : registred feeds list - !feeds last : last check time""" + def feeds(self, rcv, args): + """Monitors debian project related feeds. + + * ``!feeds`` : registred feeds list + * ``!feeds last`` : last check time""" if 'last' in args: - self.send('Last feeds check: %s' % self.th_mon.last_check) + date = '{:%Y-%m-%d %H:%M} (utc)'.format(self.th_mon.last_check) + self.reply(rcv, f'Last feeds check: {date}') return - html = ['{1}'.format(html_escape(u), - html_escape(u[7:]) - ) for u in Feeds.FEEDS] - msg = { - 'mbody': 'Feeds:\n' + '\n'.join(Feeds.FEEDS), - 'mhtml': 'Feeds:
' + '
'.join(html), - } - self.send(msg) + html = ['{1}'.format( + html_escape(u), + html_escape('{1}{2}'.format(*urlparse(u))) + ) for u in Feeds.FEEDS] + msg = {'mbody': 'Feeds:\n' + '\n'.join(Feeds.FEEDS), + 'mhtml': 'Feeds:
' + '
'.join(html)} + self.reply(rcv, msg)