Message ID | 20220506174925.731698-1-stephen@that.guru |
---|---|
State | Accepted |
Headers | show |
Series | [2/n] parser: Ignore CFWS in Message-ID header | expand |
On Fri, 2022-05-06 at 18:49 +0100, Stephen Finucane wrote: > We recently started stripping comments and folding white space from the > In-Reply-To and References headers. Do so also for the Message-ID > header. > > Signed-off-by: Stephen Finucane <stephen@that.guru> > Related: #399 > --- > patchwork/parser.py | 43 ++++++++++++++++++++++++++-------- > patchwork/tests/test_parser.py | 32 +++++++++++++++++++++++++ > 2 files changed, 65 insertions(+), 10 deletions(-) > > diff --git patchwork/parser.py patchwork/parser.py > index 17cc2325..f219f466 100644 > --- patchwork/parser.py > +++ patchwork/parser.py > @@ -236,15 +236,14 @@ def _find_series_by_references(project, mail): > name, prefixes = clean_subject(subject, [project.linkname]) > version = parse_version(name, prefixes) > > - refs = find_references(mail) > - h = clean_header(mail.get('Message-Id')) > - if h: > - refs = [h] + refs > + msg_id = find_message_id(mail) > + refs = [msg_id] + find_references(mail) > > for ref in refs: > try: > series = SeriesReference.objects.get( > - msgid=ref[:255], project=project).series > + msgid=ref[:255], project=project, > + ).series > > if series.version != version: > # if the versions don't match, at least make sure these were > @@ -473,6 +472,34 @@ def find_headers(mail): > return '\n'.join(strings) > > > +def find_message_id(mail): > + """Extract the 'message-id' headers from a given mail and validate it. > + > + The validation here is simply checking that the Message-ID is correctly > + formatted per RFC-2822. However, even if it's not we'll attempt to use what > + we're given because a patch tracked in Patchwork with janky threading is > + better than no patch whatsoever. > + """ > + header = clean_header(mail.get('Message-Id')) > + if not header: > + raise ValueError("Broken 'Message-Id' header") > + > + msgid = _msgid_re.search(header) > + if msgid: > + msgid = msgid.group(0) > + else: > + # This is only info level since the admin likely can't do anything > + # about this > + logger.info( > + "Malformed 'Message-Id' header. The 'msg-id' component should be " > + "surrounded by angle brackets. Saving raw header. This may " > + "include comments and extra comments." > + ) I wonder if I should do this also for the 'In-Reply-To' field? I can't do it easily for the 'References' field since there's zero way to delineate things if I do, but that shouldn't matter since as long as we don't drop patches and things appear roughly in order, we don't really need the 'References' field: 'In-Reply-To' is enough to find *a* parent. Stephen > + msgid = header > + > + return msgid[:255] > + > + > def find_references(mail): > """Construct a list of possible reply message ids. > > @@ -1062,11 +1089,7 @@ def parse_mail(mail, list_id=None): > > # parse metadata > > - msgid = clean_header(mail.get('Message-Id')) > - if not msgid: > - raise ValueError("Broken 'Message-Id' header") > - msgid = msgid[:255] > - > + msgid = find_message_id(mail) > subject = mail.get('Subject') > name, prefixes = clean_subject(subject, [project.linkname]) > is_comment = subject_check(subject) > diff --git patchwork/tests/test_parser.py patchwork/tests/test_parser.py > index f65ad4b1..980a8afb 100644 > --- patchwork/tests/test_parser.py > +++ patchwork/tests/test_parser.py > @@ -1265,6 +1265,38 @@ class DuplicateMailTest(TestCase): > self.assertEqual(Cover.objects.count(), 1) > > > +class TestFindMessageID(TestCase): > + > + def test_find_message_id__missing_header(self): > + email = create_email('test') > + del email['Message-Id'] > + email['Message-Id'] = '' > + > + with self.assertRaises(ValueError) as cm: > + parser.find_message_id(email) > + self.assertIn("Broken 'Message-Id' header", str(cm.exeception)) > + > + def test_find_message_id__header_with_comments(self): > + """Test that we strip comments from the Message-ID field.""" > + message_id = '<xnzgy1de8d.fsf@rhel8.vm> (message ID with a comment)' > + email = create_email('test', msgid=message_id) > + > + expected = '<xnzgy1de8d.fsf@rhel8.vm>' > + actual = parser.find_message_id(email) > + > + self.assertEqual(expected, actual) > + > + def test_find_message_id__invalid_header_fallback(self): > + """Test that we accept badly formatted Message-ID fields.""" > + message_id = '5899d592-8c87-47d9-92b6-d34260ce1aa4@radware.com>' > + email = create_email('test', msgid=message_id) > + > + expected = '5899d592-8c87-47d9-92b6-d34260ce1aa4@radware.com>' > + actual = parser.find_message_id(email) > + > + self.assertEqual(expected, actual) > + > + > class TestFindReferences(TestCase): > > def test_find_references__header_with_comments(self):
diff --git patchwork/parser.py patchwork/parser.py index 17cc2325..f219f466 100644 --- patchwork/parser.py +++ patchwork/parser.py @@ -236,15 +236,14 @@ def _find_series_by_references(project, mail): name, prefixes = clean_subject(subject, [project.linkname]) version = parse_version(name, prefixes) - refs = find_references(mail) - h = clean_header(mail.get('Message-Id')) - if h: - refs = [h] + refs + msg_id = find_message_id(mail) + refs = [msg_id] + find_references(mail) for ref in refs: try: series = SeriesReference.objects.get( - msgid=ref[:255], project=project).series + msgid=ref[:255], project=project, + ).series if series.version != version: # if the versions don't match, at least make sure these were @@ -473,6 +472,34 @@ def find_headers(mail): return '\n'.join(strings) +def find_message_id(mail): + """Extract the 'message-id' headers from a given mail and validate it. + + The validation here is simply checking that the Message-ID is correctly + formatted per RFC-2822. However, even if it's not we'll attempt to use what + we're given because a patch tracked in Patchwork with janky threading is + better than no patch whatsoever. + """ + header = clean_header(mail.get('Message-Id')) + if not header: + raise ValueError("Broken 'Message-Id' header") + + msgid = _msgid_re.search(header) + if msgid: + msgid = msgid.group(0) + else: + # This is only info level since the admin likely can't do anything + # about this + logger.info( + "Malformed 'Message-Id' header. The 'msg-id' component should be " + "surrounded by angle brackets. Saving raw header. This may " + "include comments and extra comments." + ) + msgid = header + + return msgid[:255] + + def find_references(mail): """Construct a list of possible reply message ids. @@ -1062,11 +1089,7 @@ def parse_mail(mail, list_id=None): # parse metadata - msgid = clean_header(mail.get('Message-Id')) - if not msgid: - raise ValueError("Broken 'Message-Id' header") - msgid = msgid[:255] - + msgid = find_message_id(mail) subject = mail.get('Subject') name, prefixes = clean_subject(subject, [project.linkname]) is_comment = subject_check(subject) diff --git patchwork/tests/test_parser.py patchwork/tests/test_parser.py index f65ad4b1..980a8afb 100644 --- patchwork/tests/test_parser.py +++ patchwork/tests/test_parser.py @@ -1265,6 +1265,38 @@ class DuplicateMailTest(TestCase): self.assertEqual(Cover.objects.count(), 1) +class TestFindMessageID(TestCase): + + def test_find_message_id__missing_header(self): + email = create_email('test') + del email['Message-Id'] + email['Message-Id'] = '' + + with self.assertRaises(ValueError) as cm: + parser.find_message_id(email) + self.assertIn("Broken 'Message-Id' header", str(cm.exeception)) + + def test_find_message_id__header_with_comments(self): + """Test that we strip comments from the Message-ID field.""" + message_id = '<xnzgy1de8d.fsf@rhel8.vm> (message ID with a comment)' + email = create_email('test', msgid=message_id) + + expected = '<xnzgy1de8d.fsf@rhel8.vm>' + actual = parser.find_message_id(email) + + self.assertEqual(expected, actual) + + def test_find_message_id__invalid_header_fallback(self): + """Test that we accept badly formatted Message-ID fields.""" + message_id = '5899d592-8c87-47d9-92b6-d34260ce1aa4@radware.com>' + email = create_email('test', msgid=message_id) + + expected = '5899d592-8c87-47d9-92b6-d34260ce1aa4@radware.com>' + actual = parser.find_message_id(email) + + self.assertEqual(expected, actual) + + class TestFindReferences(TestCase): def test_find_references__header_with_comments(self):
We recently started stripping comments and folding white space from the In-Reply-To and References headers. Do so also for the Message-ID header. Signed-off-by: Stephen Finucane <stephen@that.guru> Related: #399 --- patchwork/parser.py | 43 ++++++++++++++++++++++++++-------- patchwork/tests/test_parser.py | 32 +++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 10 deletions(-)