7 Commits

Author SHA1 Message Date
luk3yx
5c8f1fd25a Add optional media proxy (disabled by default) 2024-10-05 16:09:51 +13:00
luk3yx
9535a18d87 Send mentions in new format 2024-10-05 14:27:28 +13:00
luk3yx
c822f4f4ac Convert <s> and <strike> to correct IRC code 2024-03-23 11:36:02 +13:00
luk3yx
a1b45952eb Bugfix 2023-06-23 19:13:52 +12:00
luk3yx
f00a14e316 Fix GitHub workflow 2022-12-26 11:20:49 +13:00
luk3yx
1baa8ef02f Fix parsing of "<br/>" 2022-12-26 11:18:59 +13:00
luk3yx
1a0d644b60 Try again in 15s if error returned from /v3/sync 2022-11-18 09:35:17 +13:00
5 changed files with 240 additions and 21 deletions

24
.github/workflows/pythonapp.yml vendored Normal file
View File

@@ -0,0 +1,24 @@
# From https://docs.github.com/en/free-pro-team@latest/actions/guides/building-and-testing-python
# Runs the pytest suite on every push, once per Python version in the matrix.
name: Test with pytest
on: [push]
jobs:
  run-tests:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Versions are quoted so that YAML never parses one as a float (an
        # unquoted 3.10 would be read as 3.1). '3' selects the latest 3.x.
        python_version: ['3.8', '3']
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python_version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python_version }}
      - name: Install dependencies
        run: python -m pip install miniirc pytest 'requests>=2.22.0,<3'
      - name: Run pytest
        run: pytest

View File

@@ -48,6 +48,35 @@ and `PART` commands should work as expected.
Note that events sent before the client connects to Matrix are ignored. Your
system must have an accurate clock for this to work properly.
## Downloading media
Matrix has recently started to require authentication for media endpoints. By
default, miniirc_matrix now translates media files into MXC URLs. It does,
however, have a built-in HTTP proxy (disabled by default, see below).
### Proxying requests (experimental)
**Warning: I don't know how secure this is, it uses Python's `http.server`**
If you want to convert media to a normal URL, for example for use with relay
bots or code that expects normal links, you can provide a `media_proxy_port`
argument to miniirc_matrix.Matrix.
```py
miniirc_matrix.Matrix('example.com', token='my_token',
media_proxy_port=8080)
```
This will start an HTTP server on `http://127.0.0.1:8080` to listen for requests.
The server only listens on localhost.
To expose this to the public, you must use a reverse proxy, and should set up
caching and some kind of rate limiting to prevent abuse. You can set the
`media_proxy_url` keyword argument to the public proxy URL.
An HMAC is created based on the API token and URL to prevent the proxy from
being used to fetch arbitrary attachment URLs.
## Installation
You can install `miniirc_matrix` with `pip install miniirc_matrix`.

View File

@@ -6,13 +6,14 @@
from __future__ import annotations
from collections.abc import Callable
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from typing import Any, Optional, TypeVar, overload
from urllib.parse import quote as _url_quote, urlparse as _urlparse
import functools, html.parser, itertools, json, math, re, threading, time, uuid
import miniirc, requests, traceback # type: ignore
import functools, hmac, html.parser, itertools, json, math, re, time, uuid
import miniirc, requests, threading, traceback # type: ignore
ver = (0, 0, 7)
ver = (0, 0, 12)
__version__ = '.'.join(map(str, ver))
@@ -45,13 +46,26 @@ def _register_event(event_name: str):
return _register
_formatting_re = re.compile(
_invisible_formatting_re = re.compile(
r'\x02|\x1d|\x1f|\x1e|\x11|\x16|\x0f'
r'|\x03([0-9]{1,2})?(?:,([0-9]{1,2}))?'
# Hex colours
r'|\x04([0-9a-fA-F]{6})?(?:,([0-9a-fA-F]{6}))?'
)
_full_formatting_re = re.compile(
_invisible_formatting_re.pattern +
# Matrix mentions
# These currently get inserted without any escaping, if HTML characters get
# added to the regex then make sure escaping gets added
r'|\B@[a-z0-9\._=\-/+]+:[a-zA-Z0-9_\-\.]+\.[a-zA-Z]{2,}(?!\.\w)\b'
)
_html_tags = {'\x02': 'strong', '\x1d': 'em', '\x1f': 'u', '\x1e': 'del',
'\x11': 'code'}
_media_url_re = re.compile(
r'^mxc://([A-Za-z0-9_\-\.]+/[A-Za-z0-9_\-\.]+)(?:/(.*))?$'
)
class _TagManager:
@@ -86,9 +100,15 @@ class _TagManager:
self.write_tags()
self.text.append(s)
@staticmethod
def _encode_attribute(param: str) -> str:
    """
    Maps a keyword-argument name to the HTML attribute name to emit.

    "href" is passed through unchanged; any other name is prefixed with
    "data-mx-" and has its underscores replaced with hyphens.
    """
    if param != 'href':
        return 'data-mx-' + param.replace('_', '-')
    return param
def open(self, tag: str, **kwargs: Optional[str]) -> None:
self.tags[tag] = ''.join(
f' data-mx-{param.replace("_", "-")}="{value}"'
f' {self._encode_attribute(param)}="{value}"'
for param, value in kwargs.items() if value is not None
)
@@ -167,20 +187,21 @@ def _irc_colour_to_hex(code: Optional[str]) -> Optional[str]:
return ''
def _irc_to_html(irc_msg: str) -> Optional[str]:
def _irc_to_html(irc_msg: str) -> tuple[Optional[str], set[str]]:
"""
Converts IRC formatting to Matrix HTML. Returns a (html, mentions) tuple:
html is None if the message contains neither formatting codes nor Matrix
mentions, and mentions is the set of Matrix user IDs found in the message.
"""
mentions: set[str] = set()
# Escaping quotes seems to make matrix-appservice-discord do strange things
irc_msg = html.escape(irc_msg, quote=False)
# If there is no formatting return immediately
it = _formatting_re.finditer(irc_msg)
it = _full_formatting_re.finditer(irc_msg)
first_match = next(it, None)
if first_match is None:
return None
return None, mentions
tags = _TagManager()
prev_end = start = 0
@@ -212,12 +233,20 @@ def _irc_to_html(irc_msg: str) -> Optional[str]:
elif char == '\x0f':
tags.fg = tags.bg = None
tags.tags.clear()
elif char == '@':
# Matrix mention
mention = match.group(0)
tags.open('a', href=f'https://matrix.to/#/{mention}')
tags.write(mention)
tags.close('a')
mentions.add(mention)
prev_end = match.end()
tags.write(irc_msg[prev_end:])
tags.tags.clear()
tags.write_tags()
return ''.join(tags.text).replace('\n', '<br>')
return ''.join(tags.text).replace('\n', '<br>'), mentions
# This simple space collapsing regex "collapses" newlines as well
@@ -234,6 +263,8 @@ class _MatrixHTMLParser(html.parser.HTMLParser):
irc_codes['b'] = irc_codes['strong']
irc_codes['i'] = irc_codes['em']
irc_codes['br'] = '\n'
irc_codes['s'] = irc_codes['del']
irc_codes['strike'] = irc_codes['del']
def __init__(self) -> None:
super().__init__()
@@ -257,7 +288,9 @@ class _MatrixHTMLParser(html.parser.HTMLParser):
if tag in ('mx-reply', 'script'):
self.in_reply -= 1
return
if tag in self.irc_codes:
if tag == 'br':
return
elif tag in self.irc_codes:
self.text.append(self.irc_codes[tag])
elif tag != 'font':
raise _UnknownTagError(tag)
@@ -281,6 +314,48 @@ def _matrix_html_to_irc(content: _Event) -> tuple[str, bool]:
return content.body[str], False
class _MediaProxyHandler(BaseHTTPRequestHandler):
    """
    HTTP request handler for the optional media proxy.

    A GET for "/<server>/<media id>?key=<digest>" is turned into an MXC URL,
    downloaded through ``irc._download_media`` and streamed to the client.
    """

    # The Matrix client used to fetch media. Only annotated here; a concrete
    # value has to be supplied (for example by a subclass attribute).
    irc: Matrix

    def do_GET(self) -> None:
        """
        Streams the requested media file to the client.

        Non-200 upstream responses are relayed as an error with the same
        status code. A ValueError from ``_download_media`` (malformed URL or
        bad key) is reported as a 400 error.
        """
        try:
            # self.path starts with "/", so this yields "mxc://<server>/..."
            with self.irc._download_media('mxc:/' + self.path) as resp:
                if resp.status_code != 200:
                    self.send_error(resp.status_code)
                    return

                self.send_response(200)
                # Stop browsers from guessing a more dangerous content type
                # or loading anything from within the proxied document.
                self.send_header('X-Content-Type-Options', 'nosniff')
                self.send_header('Content-Security-Policy',
                                 "default-src 'none'")

                # iter_content() yields the *decoded* body, so the upstream
                # Content-Length only matches what is written below when the
                # upstream response was not content-encoded (gzip etc).
                if ('Content-Length' in resp.headers and
                        'Content-Encoding' not in resp.headers):
                    self.send_header('Content-Length',
                                     resp.headers['Content-Length'])

                # Only allow probably safe content types
                content_type = resp.headers.get('Content-Type', '')
                if (content_type.startswith(('image/', 'audio/', 'video/')) or
                        content_type == 'text/plain'):
                    self.send_header('Content-Type', content_type)
                else:
                    self.send_header('Content-Type',
                                     'application/octet-stream')
                self.end_headers()

                # Copy content
                for chunk in resp.iter_content(8192):
                    self.wfile.write(chunk)
        except ValueError as exc:
            self.send_error(400, explain=str(exc))
            return

    def log_message(self, format: str, *args) -> None:
        """Only emit http.server's request log when ``irc.debug`` is truthy."""
        if self.irc.debug:
            super().log_message(format, *args)
class _InvalidEventError(Exception):
pass
@@ -359,9 +434,14 @@ class Matrix(miniirc.IRC):
connected: Optional[bool]
msglen = 4096
def __init__(self, ip: str, port: int = 0, nick: str = '', *args,
auto_connect: bool = True,
token: Optional[str] = None, **kwargs):
def __init__(
self, ip: str, port: int = 0, nick: str = '', *args,
auto_connect: bool = True,
token: Optional[str] = None,
media_proxy_port: Optional[int] = None,
media_proxy_url: Optional[str] = None,
**kwargs
) -> None:
# Cache _get_room_url
# This is done here so that each class instance gets its own cache and
# the cache doesn't store class instances.
@@ -381,6 +461,12 @@ class Matrix(miniirc.IRC):
if token:
self.token = token
# No proxy server exists yet; this only records the requested configuration
self._media_proxy: Optional[ThreadingHTTPServer] = None
self._media_proxy_port = media_proxy_port
# If a port was given without a public URL, fall back to the local proxy
# address. (The second operand must test the URL: testing the port twice
# made this condition always false, so the default was never applied.)
if media_proxy_port and not media_proxy_url:
    media_proxy_url = f'http://127.0.0.1:{media_proxy_port}'
# Strip any trailing "/" so that "/<path>" can be appended to the base URL
self._media_proxy_url = media_proxy_url and media_proxy_url.rstrip('/')
# Stop miniirc from trying to access the (non-existent) socket
kwargs['ping_interval'] = kwargs['ping_timeout'] = None
super().__init__(ip, port, nick, *args, auto_connect=False, **kwargs)
@@ -425,7 +511,7 @@ class Matrix(miniirc.IRC):
raise ValueError(f'Status code {res.status_code} returned')
self._baseurl = f'{baseurl}/_matrix/client/{api_version}'
self._media_baseurl = f'{baseurl}/_matrix/media/{api_version}'
self._media_baseurl = f'{baseurl}/_matrix/client/v1/media'
def __get(self, endpoint: str, timeout: int = 5, /,
**params: Optional[str | int]) -> Any:
@@ -454,6 +540,26 @@ class Matrix(miniirc.IRC):
return f'rooms/{_url_quote(room_id)}'
def __make_url_digest(self, path: str) -> str:
    """
    Returns the hex-encoded HMAC-SHA256 of a media path.

    The HMAC key is a fixed version prefix followed by the API token; both
    the token and the path are encoded as ASCII.
    """
    secret = b'miniirc_matrix hmac v1 ' + self.token.encode('ascii')
    mac = hmac.new(secret, path.encode('ascii'), 'sha256')
    return mac.hexdigest()
def _download_media(self, url: str) -> requests.Response:
    """
    Starts a streamed download of the media behind a signed MXC URL.

    url has the form "mxc://<server>/<media id>?key=<hex digest>", where the
    digest must equal the one computed for "<server>/<media id>".

    Returns the streaming response for the media download endpoint. Raises
    ValueError if the URL is malformed or the key does not match.
    """
    url_base, _, key = url.partition('?key=')
    match = _media_url_re.match(url_base)
    if not match:
        raise ValueError('Invalid media URL')

    path = match.group(1)
    # Compare as bytes: hmac.compare_digest() raises TypeError when given a
    # str containing non-ASCII characters, and the key is caller-supplied.
    # A key that cannot match must end up as the ValueError below instead.
    expected = self.__make_url_digest(path).encode('ascii')
    if not hmac.compare_digest(expected, key.encode('utf-8', 'replace')):
        raise ValueError('Invalid key parameter')

    return self.__session.get(f'{self._media_baseurl}/download/{path}',
                              timeout=15, stream=True)
@functools.cached_property
def current_nick(self) -> str:
return self.__get('account/whoami')['user_id']
@@ -462,6 +568,7 @@ class Matrix(miniirc.IRC):
if self.connected is not None:
return
with self._send_lock:
self.connected = False
self._update_baseurl()
self.active_caps = self.ircv3_caps & {
'account-tag', 'echo-message', 'message-tags',
@@ -470,16 +577,34 @@ class Matrix(miniirc.IRC):
self.debug('Starting main loop (Matrix)')
self._start_main_loop()
if self._media_proxy_port:
self.debug('Starting media proxy')
class _handler(_MediaProxyHandler):
irc = self
self._media_proxy = ThreadingHTTPServer(
('127.0.0.1', self._media_proxy_port),
_handler,
)
th = threading.Thread(target=self._media_proxy.serve_forever)
th.daemon = True
th.start()
def disconnect(self) -> None:
    """
    Marks the client as disconnected and stops the media proxy, if any.
    """
    with self._send_lock:
        self.connected = False
        proxy = self._media_proxy
        if proxy is not None:
            self._media_proxy = None
            # shutdown() only stops the serve_forever() loop; server_close()
            # is what actually releases the listening socket.
            proxy.shutdown()
            proxy.server_close()
def _main(self) -> None:
try:
self.__numeric('001', f'Welcome to Matrix {self.current_nick}')
next_batch: Optional[str] = None
self.connected = True
while self.connected:
req_time = time.monotonic()
try:
res = self.__get('sync', 35, timeout='30000',
since=next_batch)
@@ -495,6 +620,14 @@ class Matrix(miniirc.IRC):
if self.debug_file:
self.debug(json.dumps(res, indent=4))
if 'error' in res:
# TODO: Use self.debug or something
print(f'[miniirc_matrix] Error returned when trying to '
f'fetch /sync: {res["error"]!r}')
if self.persist:
self.debug('Trying again in 15 seconds...')
time.sleep(15)
continue
break
next_batch = res['next_batch']
if 'rooms' in res:
@@ -558,13 +691,16 @@ class Matrix(miniirc.IRC):
msgtype = 'm.text'
params: dict[str, Any]
if html_msg := _irc_to_html(msg):
html_msg, mentions = _irc_to_html(msg)
if html_msg:
params = {
'msgtype': msgtype,
'body': _formatting_re.sub('', msg),
'body': _invisible_formatting_re.sub('', msg),
'format': 'org.matrix.custom.html',
'formatted_body': html_msg,
}
if mentions:
params['m.mentions'] = {'user_ids': list(mentions)}
else:
# No formatting
params = {'msgtype': msgtype, 'body': msg}
@@ -650,8 +786,10 @@ class Matrix(miniirc.IRC):
msg: str
if 'url' in content:
msg = content.url[str]
if msg.startswith('mxc://'):
msg = f'{self._media_baseurl}/download/{msg[6:]}'
if self._media_proxy_url and (match := _media_url_re.match(msg)):
path = match.group(1)
key = self.__make_url_digest(path)
msg = f'{self._media_proxy_url}/{path}?key={key}'
else:
msg, html_parsed_ok = _matrix_html_to_irc(content)

View File

@@ -5,7 +5,7 @@ from setuptools import setup
setup(
name='miniirc_matrix',
version='0.0.7',
version='0.0.12',
py_modules=['miniirc_matrix'],
author='luk3yx',
description='A Matrix wrapper for miniirc.',

28
test_formatting.py Normal file
View File

@@ -0,0 +1,28 @@
from miniirc_matrix import _Event, _irc_to_html, _matrix_html_to_irc
def test_irc_to_html():
    """Check _irc_to_html output for plain, formatted and mention input."""
    expectations = {
        # No formatting at all: no HTML and no mentions
        'Hello world!': (None, set()),
        '\x02Bold': ('<strong>Bold</strong>', set()),
        # Overlapping bold/italic spans
        '\x021 \x1d2\x02 3': (
            '<strong>1 <em>2</em></strong><em> 3</em>', set()
        ),
        # Mentions become links and are reported separately
        '@test:example.com: \x1dHello': (
            '<a href="https://matrix.to/#/@test:example.com">'
            '@test:example.com</a>: <em>Hello</em>',
            {'@test:example.com'},
        ),
    }
    for irc_msg, expected in expectations.items():
        assert _irc_to_html(irc_msg) == expected
def html_to_irc(html):
    """
    Run an HTML-formatted body through _matrix_html_to_irc and return the
    resulting text, asserting that the second returned value is truthy.
    """
    event = _Event({
        'format': 'org.matrix.custom.html',
        'formatted_body': html,
    })
    irc_text, parsed_ok = _matrix_html_to_irc(event)
    assert parsed_ok
    return irc_text
def test_html_to_irc():
    """Check bold tags and the different ways of writing line breaks."""
    cases = (
        ('Hello <b>world</b>!', 'Hello \x02world\x02!'),
        ('Hello\nworld!', 'Hello\nworld!'),
        ('Hello<br>world!', 'Hello\nworld!'),
        ('Hello<br/>world!', 'Hello\nworld!'),
    )
    for html, expected in cases:
        assert html_to_irc(html) == expected