diff mbox series

[v1,3/9] parser: Parse "Depends-on" tags in emails

Message ID 20240617221900.156155-3-ahassick@iol.unh.edu
State Changes Requested
Headers show
Series [v1,1/9] api: Add fields to series detail view | expand

Commit Message

Adam Hassick June 17, 2024, 10:18 p.m. UTC
Add a new function to parse "Depends-on" tags to the parser. The value
may either be a "reference" to a patch or series object or the web URL
to the object. For example, a reference may look like "patch-1234" or
"series-5678". When this tag is found, the parser will add the series
(or the series the patch belongs to) as a dependency to the patch series
it is creating.

Signed-off-by: Adam Hassick <ahassick@iol.unh.edu>
---
 patchwork/parser.py | 109 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 108 insertions(+), 1 deletion(-)

Comments

Stephen Finucane July 12, 2024, 4:01 p.m. UTC | #1
On Mon, 2024-06-17 at 18:18 -0400, Adam Hassick wrote:
> Add a new function to parse "Depends-on" tags to the parser. The value
> may either be a "reference" to a patch or series object or the web URL
> to the object. For example, a reference may look like "patch-1234" or
> "series-5678". When this tag is found, the parser will add the series
> (or the series the patch belongs to) as a dependency to the patch series
> it is creating.

I know the DPDK folks are using this integer ID-based format for dependencies
(per [1]), but I'm not a huge fan of it. My concerns are two-fold: firstly,
we've been trying to abstract/hide the integer IDs in favour of message IDs over
recent releases and we no longer expose in most of the web UI (they're still
exposed via the REST API but that's for historical reasons). This would be a
step backwards on this path. Secondly, something like 'series-5678' gives the
casual observer very little information about the location of that dependency.
You'd need to know that Patchwork was being used as well as the specific
location of the Patchwork instance in question. I wonder if, rather than
supporting this syntax, we could support Message IDs (and URLs) instead? That's
a unique identifier that is searchable both online and off (assuming you're
saving mail locally). DPDK could of course choose to add patches on top to
support their existing syntax, though they could also choose to migrate to the
new pattern. wdyt?

Some other comments on this patch as-is below.

[1] https://github.com/getpatchwork/git-pw/issues/71

> 
> Signed-off-by: Adam Hassick <ahassick@iol.unh.edu>
> ---
>  patchwork/parser.py | 109 +++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 108 insertions(+), 1 deletion(-)
> 
> diff --git a/patchwork/parser.py b/patchwork/parser.py
> index 09a53a0..90ec63b 100644
> --- a/patchwork/parser.py
> +++ b/patchwork/parser.py
> @@ -14,11 +14,14 @@ from email.errors import HeaderParseError
>  from fnmatch import fnmatch
>  import logging
>  import re
> +from urllib.parse import urlparse
>  
>  from django.contrib.auth.models import User
>  from django.db.utils import IntegrityError
>  from django.db import transaction
>  from django.utils import timezone as tz_utils
> +from django.urls import resolve, Resolver404
> +from django.conf import settings
>  
>  from patchwork.models import Cover
>  from patchwork.models import CoverComment
> @@ -32,7 +35,6 @@ from patchwork.models import Series
>  from patchwork.models import SeriesReference
>  from patchwork.models import State
>  
> -
>  _msgid_re = re.compile(r'<[^>]+>')
>  _hunk_re = re.compile(r'^\@\@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? \@\@')
>  _filename_re = re.compile(r'^(---|\+\+\+) (\S+)')
> @@ -1054,6 +1056,102 @@ def parse_pull_request(content):
>      return None
>  
>  
> +def find_series_from_url(url):
> +    """
> +    Get series or patch from URL.
> +    """
> +
> +    parse_result = urlparse(url)
> +
> +    # Resolve the URL path to see if this is a patch or series detail URL.
> +    try:
> +        result = resolve(parse_result.path)
> +    except Resolver404:
> +        return None
> +
> +    if result.view_name == 'patch-list' and parse_result.query:
> +        # Parse the query string.
> +        # This can be replaced with something much friendlier once the
> +        # series detail view is implemented.
> +        series_query_param = next(
> +            filter(
> +                lambda x: len(x) == 2 and x[0] == 'series',
> +                map(lambda x: x.split('='), parse_result.query.split('&')),
> +            )
> +        )
> +
> +        if series_query_param:

nit:

    if not series_query_param:
        return None

    series_id = series_query_param[1]

    ...

> +            series_id = series_query_param[1]
> +
> +            try:
> +                series_id_num = int(series_id)
> +            except ValueError:
> +                return None
> +
> +            return Series.objects.filter(id=series_id_num).first()
> +    elif result.view_name == 'patch-detail':
> +        msgid = Patch.decode_msgid(result.kwargs['msgid'])
> +        patch = Patch.objects.filter(msgid=msgid).first()
> +
> +        if patch:
> +            return patch.series
> +
> +
> +def find_series_from_ref(match):
> +    (obj_type, obj_id_str) = match

nit: don't need those extra brackets

> +
> +    try:
> +        object_id = int(obj_id_str)
> +    except ValueError:
> +        return None
> +
> +    if obj_type == 'series':
> +        series = Series.objects.filter(id=object_id).first()

nit:

    return Series.objects.filter(...)

> +    elif obj_type == 'patch':
> +        patch = Patch.objects.filter(id=object_id).first()
> +
> +        if not patch:
> +            return None
> +
> +        series = patch.series

nit:

    return patch.series

> +
> +    return series
> +
> +
> +def parse_depends_on(content):
> +    """Parses any dependency hints in the comments."""
> +    depends_patterns = [
> +        (
> +            re.compile(
> +                r'^Depends-on: ((?:patch)?(?:series)?)-([\d]+)(?: \("[^\n\r"]+"\))?\s*$',
> +                re.MULTILINE | re.IGNORECASE,
> +            ),
> +            find_series_from_ref,
> +        ),
> +        (
> +            re.compile(
> +                r'^Depends-on: (http[s]?:\/\/[\w\d\-.\/=&@:%?_\+()]+)\s*$',
> +                re.MULTILINE | re.IGNORECASE,
> +            ),
> +            find_series_from_url,
> +        ),
> +    ]
> +
> +    dependencies = list()

nit: Can you use a literal?

    dependencies = []

> +
> +    for pat, mapper in depends_patterns:
> +        matches = pat.findall(content)
> +
> +        # Produce a list of tuples containing type and ID of each dependency.
> +        # Eliminate elements where we could not parse the ID as an integer.
> +        dependencies.extend(
> +            filter(lambda series: series is not None, map(mapper, matches))
> +        )

This is a little too clever for my liking :)

     for match in re.findall(
         r'^Depends-on: ((?:patch)?(?:series)?)-([\d]+)(?:
\("[^\n\r"]+"\))?\s*$',
         re.MULTILINE | re.IGNORECASE,
     ):
         if series := find_series_from_ref(match):
             dependencies.append(series)

     ...

> +
> +    # Return list of series objects to depend on.
> +    return dependencies
> +
> +
>  def find_state(mail):
>      """Return the state with the given name or the default."""
>      state_name = clean_header(mail.get('X-Patchwork-State', ''))
> @@ -1171,6 +1269,12 @@ def parse_mail(mail, list_id=None):
>  
>      pull_url = parse_pull_request(message)
>  
> +    # Only look for "Depends-on" tags if the setting is enabled.
> +    if settings.ENABLE_DEPENDS_ON_PARSING:
> +        dependencies = parse_depends_on(message)
> +    else:
> +        dependencies = []
> +

The patch that adds this should come earlier in the series (before this one). We
can discuss whether it's needed there. However, if we keep it can you move the
check for it into 'parse_depends_on' and move the call to 'parse_depends_on'
below so that we can avoid parsing dependencies for non-cover letter/patch
emails?

>      # build objects
>  
>      if not is_comment and (diff or pull_url):  # patches or pull requests
> @@ -1308,6 +1412,8 @@ def parse_mail(mail, list_id=None):
>              # always have a series
>              series.add_patch(patch, x)
>  
> +        series.add_dependencies(dependencies)

    dependencies = parse_depends_on(message)
    series.add_dependencies(dependencies)

> +
>          return patch
>      elif x == 0:  # (potential) cover letters
>          # if refs are empty, it's implicitly a cover letter. If not,
> @@ -1374,6 +1480,7 @@ def parse_mail(mail, list_id=None):
>              logger.debug('Cover letter saved')
>  
>              series.add_cover_letter(cover_letter)
> +            series.add_dependencies(dependencies)

    dependencies = parse_depends_on(message)
    series.add_dependencies(dependencies)

>              return cover_letter
>
Adam Hassick July 12, 2024, 7:28 p.m. UTC | #2
On Fri, Jul 12, 2024 at 12:02 PM Stephen Finucane <stephen@that.guru> wrote:
>
> On Mon, 2024-06-17 at 18:18 -0400, Adam Hassick wrote:
> > Add a new function to parse "Depends-on" tags to the parser. The value
> > may either be a "reference" to a patch or series object or the web URL
> > to the object. For example, a reference may look like "patch-1234" or
> > "series-5678". When this tag is found, the parser will add the series
> > (or the series the patch belongs to) as a dependency to the patch series
> > it is creating.
>
> I know the DPDK folks are using this integer ID-based format for dependencies
> (per [1]), but I'm not a huge fan of it. My concerns are two-fold: firstly,
> we've been trying to abstract/hide the integer IDs in favour of message IDs over
> recent releases and we no longer expose in most of the web UI (they're still
> exposed via the REST API but that's for historical reasons). This would be a
> step backwards on this path. Secondly, something like 'series-5678' gives the
> casual observer very little information about the location of that dependency.
> You'd need to know that Patchwork was being used as well as the specific
> location of the Patchwork instance in question. I wonder if, rather than
> supporting this syntax, we could support Message IDs (and URLs) instead? That's
> a unique identifier that is searchable both online and off (assuming you're
> saving mail locally). DPDK could of course choose to add patches on top to
> support their existing syntax, though they could also choose to migrate to the
> new pattern. wdyt?

Sure. We can use message IDs instead of the reference format we came
up with. I'll reach out to the DPDK community and see if they are
receptive to changing the procedure. My guess is they would prefer
that over running Patchwork with custom patches applied.

> Some other comments on this patch as-is below.
>
> [1] https://github.com/getpatchwork/git-pw/issues/71
>
> >
> > Signed-off-by: Adam Hassick <ahassick@iol.unh.edu>
> > ---
> >  patchwork/parser.py | 109 +++++++++++++++++++++++++++++++++++++++++++-
> >  1 file changed, 108 insertions(+), 1 deletion(-)
> >
> > diff --git a/patchwork/parser.py b/patchwork/parser.py
> > index 09a53a0..90ec63b 100644
> > --- a/patchwork/parser.py
> > +++ b/patchwork/parser.py
> > @@ -14,11 +14,14 @@ from email.errors import HeaderParseError
> >  from fnmatch import fnmatch
> >  import logging
> >  import re
> > +from urllib.parse import urlparse
> >
> >  from django.contrib.auth.models import User
> >  from django.db.utils import IntegrityError
> >  from django.db import transaction
> >  from django.utils import timezone as tz_utils
> > +from django.urls import resolve, Resolver404
> > +from django.conf import settings
> >
> >  from patchwork.models import Cover
> >  from patchwork.models import CoverComment
> > @@ -32,7 +35,6 @@ from patchwork.models import Series
> >  from patchwork.models import SeriesReference
> >  from patchwork.models import State
> >
> > -
> >  _msgid_re = re.compile(r'<[^>]+>')
> >  _hunk_re = re.compile(r'^\@\@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? \@\@')
> >  _filename_re = re.compile(r'^(---|\+\+\+) (\S+)')
> > @@ -1054,6 +1056,102 @@ def parse_pull_request(content):
> >      return None
> >
> >
> > +def find_series_from_url(url):
> > +    """
> > +    Get series or patch from URL.
> > +    """
> > +
> > +    parse_result = urlparse(url)
> > +
> > +    # Resolve the URL path to see if this is a patch or series detail URL.
> > +    try:
> > +        result = resolve(parse_result.path)
> > +    except Resolver404:
> > +        return None
> > +
> > +    if result.view_name == 'patch-list' and parse_result.query:
> > +        # Parse the query string.
> > +        # This can be replaced with something much friendlier once the
> > +        # series detail view is implemented.
> > +        series_query_param = next(
> > +            filter(
> > +                lambda x: len(x) == 2 and x[0] == 'series',
> > +                map(lambda x: x.split('='), parse_result.query.split('&')),
> > +            )
> > +        )
> > +
> > +        if series_query_param:
>
> nit:
>
>     if not series_query_param:
>         return None
>
>     series_id = series_query_param[1]
>
>     ...
>
> > +            series_id = series_query_param[1]
> > +
> > +            try:
> > +                series_id_num = int(series_id)
> > +            except ValueError:
> > +                return None
> > +
> > +            return Series.objects.filter(id=series_id_num).first()
> > +    elif result.view_name == 'patch-detail':
> > +        msgid = Patch.decode_msgid(result.kwargs['msgid'])
> > +        patch = Patch.objects.filter(msgid=msgid).first()
> > +
> > +        if patch:
> > +            return patch.series
> > +
> > +
> > +def find_series_from_ref(match):
> > +    (obj_type, obj_id_str) = match
>
> nit: don't need those extra brackets
>
> > +
> > +    try:
> > +        object_id = int(obj_id_str)
> > +    except ValueError:
> > +        return None
> > +
> > +    if obj_type == 'series':
> > +        series = Series.objects.filter(id=object_id).first()
>
> nit:
>
>     return Series.objects.filter(...)
>
> > +    elif obj_type == 'patch':
> > +        patch = Patch.objects.filter(id=object_id).first()
> > +
> > +        if not patch:
> > +            return None
> > +
> > +        series = patch.series
>
> nit:
>
>     return patch.series
>
> > +
> > +    return series
> > +
> > +
> > +def parse_depends_on(content):
> > +    """Parses any dependency hints in the comments."""
> > +    depends_patterns = [
> > +        (
> > +            re.compile(
> > +                r'^Depends-on: ((?:patch)?(?:series)?)-([\d]+)(?: \("[^\n\r"]+"\))?\s*$',
> > +                re.MULTILINE | re.IGNORECASE,
> > +            ),
> > +            find_series_from_ref,
> > +        ),
> > +        (
> > +            re.compile(
> > +                r'^Depends-on: (http[s]?:\/\/[\w\d\-.\/=&@:%?_\+()]+)\s*$',
> > +                re.MULTILINE | re.IGNORECASE,
> > +            ),
> > +            find_series_from_url,
> > +        ),
> > +    ]
> > +
> > +    dependencies = list()
>
> nit: Can you use a literal?
>
>     dependencies = []
>
> > +
> > +    for pat, mapper in depends_patterns:
> > +        matches = pat.findall(content)
> > +
> > +        # Produce a list of tuples containing type and ID of each dependency.
> > +        # Eliminate elements where we could not parse the ID as an integer.
> > +        dependencies.extend(
> > +            filter(lambda series: series is not None, map(mapper, matches))
> > +        )
>
> This is a little too clever for my liking :)
>
>      for match in re.findall(
>          r'^Depends-on: ((?:patch)?(?:series)?)-([\d]+)(?:
> \("[^\n\r"]+"\))?\s*$',
>          re.MULTILINE | re.IGNORECASE,
>      ):
>          if series := find_series_from_ref(match):
>              dependencies.append(series)
>
>      ...

I was looking to make this as easily extensible as possible, so that
if another contributor in the future wants to come along and add
another format to refer to a series/patch it can be added easily.
But we can unwind this into two loops, one for message ID and one for
URL parsing.

> > +
> > +    # Return list of series objects to depend on.
> > +    return dependencies
> > +
> > +
> >  def find_state(mail):
> >      """Return the state with the given name or the default."""
> >      state_name = clean_header(mail.get('X-Patchwork-State', ''))
> > @@ -1171,6 +1269,12 @@ def parse_mail(mail, list_id=None):
> >
> >      pull_url = parse_pull_request(message)
> >
> > +    # Only look for "Depends-on" tags if the setting is enabled.
> > +    if settings.ENABLE_DEPENDS_ON_PARSING:
> > +        dependencies = parse_depends_on(message)
> > +    else:
> > +        dependencies = []
> > +
>
> The patch that adds this should come earlier in the series (before this one). We
> can discuss whether it's needed there. However, if we keep it can you move the
> check for it into 'parse_depends_on' and move the call to 'parse_depends_on'
> below so that we can avoid parsing dependencies for non-cover letter/patch
> emails?

Makes sense to me.

>
> >      # build objects
> >
> >      if not is_comment and (diff or pull_url):  # patches or pull requests
> > @@ -1308,6 +1412,8 @@ def parse_mail(mail, list_id=None):
> >              # always have a series
> >              series.add_patch(patch, x)
> >
> > +        series.add_dependencies(dependencies)
>
>     dependencies = parse_depends_on(message)
>     series.add_dependencies(dependencies)
>
> > +
> >          return patch
> >      elif x == 0:  # (potential) cover letters
> >          # if refs are empty, it's implicitly a cover letter. If not,
> > @@ -1374,6 +1480,7 @@ def parse_mail(mail, list_id=None):
> >              logger.debug('Cover letter saved')
> >
> >              series.add_cover_letter(cover_letter)
> > +            series.add_dependencies(dependencies)
>
>     dependencies = parse_depends_on(message)
>     series.add_dependencies(dependencies)
>
> >              return cover_letter
> >
>
Stephen Finucane July 18, 2024, 4:23 p.m. UTC | #3
On Fri, 2024-07-12 at 15:28 -0400, Adam Hassick wrote:
> On Fri, Jul 12, 2024 at 12:02 PM Stephen Finucane <stephen@that.guru> wrote:
> > 
> > On Mon, 2024-06-17 at 18:18 -0400, Adam Hassick wrote:
> > > Add a new function to parse "Depends-on" tags to the parser. The value
> > > may either be a "reference" to a patch or series object or the web URL
> > > to the object. For example, a reference may look like "patch-1234" or
> > > "series-5678". When this tag is found, the parser will add the series
> > > (or the series the patch belongs to) as a dependency to the patch series
> > > it is creating.
> > 
> > I know the DPDK folks are using this integer ID-based format for dependencies
> > (per [1]), but I'm not a huge fan of it. My concerns are two-fold: firstly,
> > we've been trying to abstract/hide the integer IDs in favour of message IDs over
> > recent releases and we no longer expose in most of the web UI (they're still
> > exposed via the REST API but that's for historical reasons). This would be a
> > step backwards on this path. Secondly, something like 'series-5678' gives the
> > casual observer very little information about the location of that dependency.
> > You'd need to know that Patchwork was being used as well as the specific
> > location of the Patchwork instance in question. I wonder if, rather than
> > supporting this syntax, we could support Message IDs (and URLs) instead? That's
> > a unique identifier that is searchable both online and off (assuming you're
> > saving mail locally). DPDK could of course choose to add patches on top to
> > support their existing syntax, though they could also choose to migrate to the
> > new pattern. wdyt?
> 
> Sure. We can use message IDs instead of the reference format we came
> up with. I'll reach out to the DPDK community and see if they are
> receptive to changing the procedure. My guess is they would prefer
> that over running Patchwork with custom patches applied.

Any updates on this? I'm assuming we're waiting on that before I can expect a
v2? Or is the v2 gone elsewhere and I simply haven't spotted it?

Stephen

> 
> > Some other comments on this patch as-is below.
> > 
> > [1] https://github.com/getpatchwork/git-pw/issues/71
> > 
> > > 
> > > Signed-off-by: Adam Hassick <ahassick@iol.unh.edu>
> > > ---
> > >  patchwork/parser.py | 109 +++++++++++++++++++++++++++++++++++++++++++-
> > >  1 file changed, 108 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/patchwork/parser.py b/patchwork/parser.py
> > > index 09a53a0..90ec63b 100644
> > > --- a/patchwork/parser.py
> > > +++ b/patchwork/parser.py
> > > @@ -14,11 +14,14 @@ from email.errors import HeaderParseError
> > >  from fnmatch import fnmatch
> > >  import logging
> > >  import re
> > > +from urllib.parse import urlparse
> > > 
> > >  from django.contrib.auth.models import User
> > >  from django.db.utils import IntegrityError
> > >  from django.db import transaction
> > >  from django.utils import timezone as tz_utils
> > > +from django.urls import resolve, Resolver404
> > > +from django.conf import settings
> > > 
> > >  from patchwork.models import Cover
> > >  from patchwork.models import CoverComment
> > > @@ -32,7 +35,6 @@ from patchwork.models import Series
> > >  from patchwork.models import SeriesReference
> > >  from patchwork.models import State
> > > 
> > > -
> > >  _msgid_re = re.compile(r'<[^>]+>')
> > >  _hunk_re = re.compile(r'^\@\@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? \@\@')
> > >  _filename_re = re.compile(r'^(---|\+\+\+) (\S+)')
> > > @@ -1054,6 +1056,102 @@ def parse_pull_request(content):
> > >      return None
> > > 
> > > 
> > > +def find_series_from_url(url):
> > > +    """
> > > +    Get series or patch from URL.
> > > +    """
> > > +
> > > +    parse_result = urlparse(url)
> > > +
> > > +    # Resolve the URL path to see if this is a patch or series detail URL.
> > > +    try:
> > > +        result = resolve(parse_result.path)
> > > +    except Resolver404:
> > > +        return None
> > > +
> > > +    if result.view_name == 'patch-list' and parse_result.query:
> > > +        # Parse the query string.
> > > +        # This can be replaced with something much friendlier once the
> > > +        # series detail view is implemented.
> > > +        series_query_param = next(
> > > +            filter(
> > > +                lambda x: len(x) == 2 and x[0] == 'series',
> > > +                map(lambda x: x.split('='), parse_result.query.split('&')),
> > > +            )
> > > +        )
> > > +
> > > +        if series_query_param:
> > 
> > nit:
> > 
> >     if not series_query_param:
> >         return None
> > 
> >     series_id = series_query_param[1]
> > 
> >     ...
> > 
> > > +            series_id = series_query_param[1]
> > > +
> > > +            try:
> > > +                series_id_num = int(series_id)
> > > +            except ValueError:
> > > +                return None
> > > +
> > > +            return Series.objects.filter(id=series_id_num).first()
> > > +    elif result.view_name == 'patch-detail':
> > > +        msgid = Patch.decode_msgid(result.kwargs['msgid'])
> > > +        patch = Patch.objects.filter(msgid=msgid).first()
> > > +
> > > +        if patch:
> > > +            return patch.series
> > > +
> > > +
> > > +def find_series_from_ref(match):
> > > +    (obj_type, obj_id_str) = match
> > 
> > nit: don't need those extra brackets
> > 
> > > +
> > > +    try:
> > > +        object_id = int(obj_id_str)
> > > +    except ValueError:
> > > +        return None
> > > +
> > > +    if obj_type == 'series':
> > > +        series = Series.objects.filter(id=object_id).first()
> > 
> > nit:
> > 
> >     return Series.objects.filter(...)
> > 
> > > +    elif obj_type == 'patch':
> > > +        patch = Patch.objects.filter(id=object_id).first()
> > > +
> > > +        if not patch:
> > > +            return None
> > > +
> > > +        series = patch.series
> > 
> > nit:
> > 
> >     return patch.series
> > 
> > > +
> > > +    return series
> > > +
> > > +
> > > +def parse_depends_on(content):
> > > +    """Parses any dependency hints in the comments."""
> > > +    depends_patterns = [
> > > +        (
> > > +            re.compile(
> > > +                r'^Depends-on: ((?:patch)?(?:series)?)-([\d]+)(?: \("[^\n\r"]+"\))?\s*$',
> > > +                re.MULTILINE | re.IGNORECASE,
> > > +            ),
> > > +            find_series_from_ref,
> > > +        ),
> > > +        (
> > > +            re.compile(
> > > +                r'^Depends-on: (http[s]?:\/\/[\w\d\-.\/=&@:%?_\+()]+)\s*$',
> > > +                re.MULTILINE | re.IGNORECASE,
> > > +            ),
> > > +            find_series_from_url,
> > > +        ),
> > > +    ]
> > > +
> > > +    dependencies = list()
> > 
> > nit: Can you use a literal?
> > 
> >     dependencies = []
> > 
> > > +
> > > +    for pat, mapper in depends_patterns:
> > > +        matches = pat.findall(content)
> > > +
> > > +        # Produce a list of tuples containing type and ID of each dependency.
> > > +        # Eliminate elements where we could not parse the ID as an integer.
> > > +        dependencies.extend(
> > > +            filter(lambda series: series is not None, map(mapper, matches))
> > > +        )
> > 
> > This is a little too clever for my liking :)
> > 
> >      for match in re.findall(
> >          r'^Depends-on: ((?:patch)?(?:series)?)-([\d]+)(?:
> > \("[^\n\r"]+"\))?\s*$',
> >          re.MULTILINE | re.IGNORECASE,
> >      ):
> >          if series := find_series_from_ref(match):
> >              dependencies.append(series)
> > 
> >      ...
> 
> I was looking to make this as easily extensible as possible, so that
> if another contributor in the future wants to come along and add
> another format to refer to a series/patch it can be added easily.
> But we can unwind this into two loops, one for message ID and one for
> URL parsing.
> 
> > > +
> > > +    # Return list of series objects to depend on.
> > > +    return dependencies
> > > +
> > > +
> > >  def find_state(mail):
> > >      """Return the state with the given name or the default."""
> > >      state_name = clean_header(mail.get('X-Patchwork-State', ''))
> > > @@ -1171,6 +1269,12 @@ def parse_mail(mail, list_id=None):
> > > 
> > >      pull_url = parse_pull_request(message)
> > > 
> > > +    # Only look for "Depends-on" tags if the setting is enabled.
> > > +    if settings.ENABLE_DEPENDS_ON_PARSING:
> > > +        dependencies = parse_depends_on(message)
> > > +    else:
> > > +        dependencies = []
> > > +
> > 
> > The patch that adds this should come earlier in the series (before this one). We
> > can discuss whether it's needed there. However, if we keep it can you move the
> > check for it into 'parse_depends_on' and move the call to 'parse_depends_on'
> > below so that we can avoid parsing dependencies for non-cover letter/patch
> > emails?
> 
> Makes sense to me.
> 
> > 
> > >      # build objects
> > > 
> > >      if not is_comment and (diff or pull_url):  # patches or pull requests
> > > @@ -1308,6 +1412,8 @@ def parse_mail(mail, list_id=None):
> > >              # always have a series
> > >              series.add_patch(patch, x)
> > > 
> > > +        series.add_dependencies(dependencies)
> > 
> >     dependencies = parse_depends_on(message)
> >     series.add_dependencies(dependencies)
> > 
> > > +
> > >          return patch
> > >      elif x == 0:  # (potential) cover letters
> > >          # if refs are empty, it's implicitly a cover letter. If not,
> > > @@ -1374,6 +1480,7 @@ def parse_mail(mail, list_id=None):
> > >              logger.debug('Cover letter saved')
> > > 
> > >              series.add_cover_letter(cover_letter)
> > > +            series.add_dependencies(dependencies)
> > 
> >     dependencies = parse_depends_on(message)
> >     series.add_dependencies(dependencies)
> > 
> > >              return cover_letter
> > > 
> >
Adam Hassick July 22, 2024, 3:59 p.m. UTC | #4
There's an ongoing discussion, no v2 submitted yet. I will submit it
to this list when it's ready.
Thanks for checking in.

On Thu, Jul 18, 2024 at 12:24 PM Stephen Finucane <stephen@that.guru> wrote:
>
> On Fri, 2024-07-12 at 15:28 -0400, Adam Hassick wrote:
> > On Fri, Jul 12, 2024 at 12:02 PM Stephen Finucane <stephen@that.guru> wrote:
> > >
> > > On Mon, 2024-06-17 at 18:18 -0400, Adam Hassick wrote:
> > > > Add a new function to parse "Depends-on" tags to the parser. The value
> > > > may either be a "reference" to a patch or series object or the web URL
> > > > to the object. For example, a reference may look like "patch-1234" or
> > > > "series-5678". When this tag is found, the parser will add the series
> > > > (or the series the patch belongs to) as a dependency to the patch series
> > > > it is creating.
> > >
> > > I know the DPDK folks are using this integer ID-based format for dependencies
> > > (per [1]), but I'm not a huge fan of it. My concerns are two-fold: firstly,
> > > we've been trying to abstract/hide the integer IDs in favour of message IDs over
> > > recent releases and we no longer expose in most of the web UI (they're still
> > > exposed via the REST API but that's for historical reasons). This would be a
> > > step backwards on this path. Secondly, something like 'series-5678' gives the
> > > casual observer very little information about the location of that dependency.
> > > You'd need to know that Patchwork was being used as well as the specific
> > > location of the Patchwork instance in question. I wonder if, rather than
> > > supporting this syntax, we could support Message IDs (and URLs) instead? That's
> > > a unique identifier that is searchable both online and off (assuming you're
> > > saving mail locally). DPDK could of course choose to add patches on top to
> > > support their existing syntax, though they could also choose to migrate to the
> > > new pattern. wdyt?
> >
> > Sure. We can use message IDs instead of the reference format we came
> > up with. I'll reach out to the DPDK community and see if they are
> > receptive to changing the procedure. My guess is they would prefer
> > that over running Patchwork with custom patches applied.
>
> Any updates on this? I'm assuming we're waiting on that before I can expect a
> v2? Or is the v2 gone elsewhere and I simply haven't spotted it?
>
> Stephen
>
> >
> > > Some other comments on this patch as-is below.
> > >
> > > [1] https://github.com/getpatchwork/git-pw/issues/71
> > >
> > > >
> > > > Signed-off-by: Adam Hassick <ahassick@iol.unh.edu>
> > > > ---
> > > >  patchwork/parser.py | 109 +++++++++++++++++++++++++++++++++++++++++++-
> > > >  1 file changed, 108 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/patchwork/parser.py b/patchwork/parser.py
> > > > index 09a53a0..90ec63b 100644
> > > > --- a/patchwork/parser.py
> > > > +++ b/patchwork/parser.py
> > > > @@ -14,11 +14,14 @@ from email.errors import HeaderParseError
> > > >  from fnmatch import fnmatch
> > > >  import logging
> > > >  import re
> > > > +from urllib.parse import urlparse
> > > >
> > > >  from django.contrib.auth.models import User
> > > >  from django.db.utils import IntegrityError
> > > >  from django.db import transaction
> > > >  from django.utils import timezone as tz_utils
> > > > +from django.urls import resolve, Resolver404
> > > > +from django.conf import settings
> > > >
> > > >  from patchwork.models import Cover
> > > >  from patchwork.models import CoverComment
> > > > @@ -32,7 +35,6 @@ from patchwork.models import Series
> > > >  from patchwork.models import SeriesReference
> > > >  from patchwork.models import State
> > > >
> > > > -
> > > >  _msgid_re = re.compile(r'<[^>]+>')
> > > >  _hunk_re = re.compile(r'^\@\@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? \@\@')
> > > >  _filename_re = re.compile(r'^(---|\+\+\+) (\S+)')
> > > > @@ -1054,6 +1056,102 @@ def parse_pull_request(content):
> > > >      return None
> > > >
> > > >
> > > > +def find_series_from_url(url):
> > > > +    """
> > > > +    Get series or patch from URL.
> > > > +    """
> > > > +
> > > > +    parse_result = urlparse(url)
> > > > +
> > > > +    # Resolve the URL path to see if this is a patch or series detail URL.
> > > > +    try:
> > > > +        result = resolve(parse_result.path)
> > > > +    except Resolver404:
> > > > +        return None
> > > > +
> > > > +    if result.view_name == 'patch-list' and parse_result.query:
> > > > +        # Parse the query string.
> > > > +        # This can be replaced with something much friendlier once the
> > > > +        # series detail view is implemented.
> > > > +        series_query_param = next(
> > > > +            filter(
> > > > +                lambda x: len(x) == 2 and x[0] == 'series',
> > > > +                map(lambda x: x.split('='), parse_result.query.split('&')),
> > > > +            )
> > > > +        )
> > > > +
> > > > +        if series_query_param:
> > >
> > > nit:
> > >
> > >     if not series_query_param:
> > >         return None
> > >
> > >     series_id = series_query_param[1]
> > >
> > >     ...
> > >
> > > > +            series_id = series_query_param[1]
> > > > +
> > > > +            try:
> > > > +                series_id_num = int(series_id)
> > > > +            except ValueError:
> > > > +                return None
> > > > +
> > > > +            return Series.objects.filter(id=series_id_num).first()
> > > > +    elif result.view_name == 'patch-detail':
> > > > +        msgid = Patch.decode_msgid(result.kwargs['msgid'])
> > > > +        patch = Patch.objects.filter(msgid=msgid).first()
> > > > +
> > > > +        if patch:
> > > > +            return patch.series
> > > > +
> > > > +
> > > > +def find_series_from_ref(match):
> > > > +    (obj_type, obj_id_str) = match
> > >
> > > nit: don't need those extra brackets
> > >
> > > > +
> > > > +    try:
> > > > +        object_id = int(obj_id_str)
> > > > +    except ValueError:
> > > > +        return None
> > > > +
> > > > +    if obj_type == 'series':
> > > > +        series = Series.objects.filter(id=object_id).first()
> > >
> > > nit:
> > >
> > >     return Series.objects.filter(...)
> > >
> > > > +    elif obj_type == 'patch':
> > > > +        patch = Patch.objects.filter(id=object_id).first()
> > > > +
> > > > +        if not patch:
> > > > +            return None
> > > > +
> > > > +        series = patch.series
> > >
> > > nit:
> > >
> > >     return patch.series
> > >
> > > > +
> > > > +    return series
> > > > +
> > > > +
> > > > +def parse_depends_on(content):
> > > > +    """Parses any dependency hints in the comments."""
> > > > +    depends_patterns = [
> > > > +        (
> > > > +            re.compile(
> > > > +                r'^Depends-on: ((?:patch)?(?:series)?)-([\d]+)(?: \("[^\n\r"]+"\))?\s*$',
> > > > +                re.MULTILINE | re.IGNORECASE,
> > > > +            ),
> > > > +            find_series_from_ref,
> > > > +        ),
> > > > +        (
> > > > +            re.compile(
> > > > +                r'^Depends-on: (http[s]?:\/\/[\w\d\-.\/=&@:%?_\+()]+)\s*$',
> > > > +                re.MULTILINE | re.IGNORECASE,
> > > > +            ),
> > > > +            find_series_from_url,
> > > > +        ),
> > > > +    ]
> > > > +
> > > > +    dependencies = list()
> > >
> > > nit: Can you use a literal?
> > >
> > >     dependencies = []
> > >
> > > > +
> > > > +    for pat, mapper in depends_patterns:
> > > > +        matches = pat.findall(content)
> > > > +
> > > > +        # Produce a list of tuples containing type and ID of each dependency.
> > > > +        # Eliminate elements where we could not parse the ID as an integer.
> > > > +        dependencies.extend(
> > > > +            filter(lambda series: series is not None, map(mapper, matches))
> > > > +        )
> > >
> > > This is a little too clever for my liking :)
> > >
> > >      for match in re.findall(
> > >          r'^Depends-on: ((?:patch)?(?:series)?)-([\d]+)(?:
> > > \("[^\n\r"]+"\))?\s*$',
> > >          re.MULTILINE | re.IGNORECASE,
> > >      ):
> > >          if series := find_series_from_ref(match):
> > >              dependencies.append(series)
> > >
> > >      ...
> >
> > I was looking to make this as easily extensible as possible, so that
> > if another contributor in the future wants to come along and add
> > another format to refer to a series/patch it can be added easily.
> > But we can unwind this into two loops, one for message ID and one for
> > URL parsing.
> >
> > > > +
> > > > +    # Return list of series objects to depend on.
> > > > +    return dependencies
> > > > +
> > > > +
> > > >  def find_state(mail):
> > > >      """Return the state with the given name or the default."""
> > > >      state_name = clean_header(mail.get('X-Patchwork-State', ''))
> > > > @@ -1171,6 +1269,12 @@ def parse_mail(mail, list_id=None):
> > > >
> > > >      pull_url = parse_pull_request(message)
> > > >
> > > > +    # Only look for "Depends-on" tags if the setting is enabled.
> > > > +    if settings.ENABLE_DEPENDS_ON_PARSING:
> > > > +        dependencies = parse_depends_on(message)
> > > > +    else:
> > > > +        dependencies = []
> > > > +
> > >
> > > The patch that adds this should come earlier in the series (before this one). We
> > > can discuss whether it's needed there. However, if we keep it can you move the
> > > check for it into 'parse_depends_on' and move the call to 'parse_depends_on'
> > > below so that we can avoid parsing dependencies for non-cover letter/patch
> > > emails?
> >
> > Makes sense to me.
> >
> > >
> > > >      # build objects
> > > >
> > > >      if not is_comment and (diff or pull_url):  # patches or pull requests
> > > > @@ -1308,6 +1412,8 @@ def parse_mail(mail, list_id=None):
> > > >              # always have a series
> > > >              series.add_patch(patch, x)
> > > >
> > > > +        series.add_dependencies(dependencies)
> > >
> > >     dependencies = parse_depends_on(message)
> > >     series.add_dependencies(dependencies)
> > >
> > > > +
> > > >          return patch
> > > >      elif x == 0:  # (potential) cover letters
> > > >          # if refs are empty, it's implicitly a cover letter. If not,
> > > > @@ -1374,6 +1480,7 @@ def parse_mail(mail, list_id=None):
> > > >              logger.debug('Cover letter saved')
> > > >
> > > >              series.add_cover_letter(cover_letter)
> > > > +            series.add_dependencies(dependencies)
> > >
> > >     dependencies = parse_depends_on(message)
> > >     series.add_dependencies(dependencies)
> > >
> > > >              return cover_letter
> > > >
> > >
>
diff mbox series

Patch

diff --git a/patchwork/parser.py b/patchwork/parser.py
index 09a53a0..90ec63b 100644
--- a/patchwork/parser.py
+++ b/patchwork/parser.py
@@ -14,11 +14,14 @@  from email.errors import HeaderParseError
 from fnmatch import fnmatch
 import logging
 import re
+from urllib.parse import urlparse
 
 from django.contrib.auth.models import User
 from django.db.utils import IntegrityError
 from django.db import transaction
 from django.utils import timezone as tz_utils
+from django.urls import resolve, Resolver404
+from django.conf import settings
 
 from patchwork.models import Cover
 from patchwork.models import CoverComment
@@ -32,7 +35,6 @@  from patchwork.models import Series
 from patchwork.models import SeriesReference
 from patchwork.models import State
 
-
 _msgid_re = re.compile(r'<[^>]+>')
 _hunk_re = re.compile(r'^\@\@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? \@\@')
 _filename_re = re.compile(r'^(---|\+\+\+) (\S+)')
@@ -1054,6 +1056,102 @@  def parse_pull_request(content):
     return None
 
 
+def find_series_from_url(url):
+    """
+    Get series or patch from URL.
+    """
+
+    parse_result = urlparse(url)
+
+    # Resolve the URL path to see if this is a patch or series detail URL.
+    try:
+        result = resolve(parse_result.path)
+    except Resolver404:
+        return None
+
+    if result.view_name == 'patch-list' and parse_result.query:
+        # Parse the query string.
+        # This can be replaced with something much friendlier once the
+        # series detail view is implemented.
+        series_query_param = next(
+            filter(
+                lambda x: len(x) == 2 and x[0] == 'series',
+                map(lambda x: x.split('='), parse_result.query.split('&')),
+            )
+        )
+
+        if series_query_param:
+            series_id = series_query_param[1]
+
+            try:
+                series_id_num = int(series_id)
+            except ValueError:
+                return None
+
+            return Series.objects.filter(id=series_id_num).first()
+    elif result.view_name == 'patch-detail':
+        msgid = Patch.decode_msgid(result.kwargs['msgid'])
+        patch = Patch.objects.filter(msgid=msgid).first()
+
+        if patch:
+            return patch.series
+
+
+def find_series_from_ref(match):
+    (obj_type, obj_id_str) = match
+
+    try:
+        object_id = int(obj_id_str)
+    except ValueError:
+        return None
+
+    if obj_type == 'series':
+        series = Series.objects.filter(id=object_id).first()
+    elif obj_type == 'patch':
+        patch = Patch.objects.filter(id=object_id).first()
+
+        if not patch:
+            return None
+
+        series = patch.series
+
+    return series
+
+
+def parse_depends_on(content):
+    """Parses any dependency hints in the comments."""
+    depends_patterns = [
+        (
+            re.compile(
+                r'^Depends-on: ((?:patch)?(?:series)?)-([\d]+)(?: \("[^\n\r"]+"\))?\s*$',
+                re.MULTILINE | re.IGNORECASE,
+            ),
+            find_series_from_ref,
+        ),
+        (
+            re.compile(
+                r'^Depends-on: (http[s]?:\/\/[\w\d\-.\/=&@:%?_\+()]+)\s*$',
+                re.MULTILINE | re.IGNORECASE,
+            ),
+            find_series_from_url,
+        ),
+    ]
+
+    dependencies = list()
+
+    for pat, mapper in depends_patterns:
+        matches = pat.findall(content)
+
+        # Produce a list of tuples containing type and ID of each dependency.
+        # Eliminate elements where we could not parse the ID as an integer.
+        dependencies.extend(
+            filter(lambda series: series is not None, map(mapper, matches))
+        )
+
+    # Return list of series objects to depend on.
+    return dependencies
+
+
 def find_state(mail):
     """Return the state with the given name or the default."""
     state_name = clean_header(mail.get('X-Patchwork-State', ''))
@@ -1171,6 +1269,12 @@  def parse_mail(mail, list_id=None):
 
     pull_url = parse_pull_request(message)
 
+    # Only look for "Depends-on" tags if the setting is enabled.
+    if settings.ENABLE_DEPENDS_ON_PARSING:
+        dependencies = parse_depends_on(message)
+    else:
+        dependencies = []
+
     # build objects
 
     if not is_comment and (diff or pull_url):  # patches or pull requests
@@ -1308,6 +1412,8 @@  def parse_mail(mail, list_id=None):
             # always have a series
             series.add_patch(patch, x)
 
+        series.add_dependencies(dependencies)
+
         return patch
     elif x == 0:  # (potential) cover letters
         # if refs are empty, it's implicitly a cover letter. If not,
@@ -1374,6 +1480,7 @@  def parse_mail(mail, list_id=None):
             logger.debug('Cover letter saved')
 
             series.add_cover_letter(cover_letter)
+            series.add_dependencies(dependencies)
 
             return cover_letter