From f56596c3fe835c540f14da0f145f45d1d7bbb467 Mon Sep 17 00:00:00 2001 From: msglm Date: Thu, 23 May 2024 05:15:46 -0500 Subject: patch update for bugwarrior --- aux-files/python-bugwarrior/gitea-support.patch | 584 +++++++++++++++++++++ .../snscrape/snscrape-downloads-telegram.patch | 495 +++++++++++++++++ 2 files changed, 1079 insertions(+) create mode 100644 aux-files/snscrape/snscrape-downloads-telegram.patch (limited to 'aux-files') diff --git a/aux-files/python-bugwarrior/gitea-support.patch b/aux-files/python-bugwarrior/gitea-support.patch index 4ec4a9b..6c2c331 100644 --- a/aux-files/python-bugwarrior/gitea-support.patch +++ b/aux-files/python-bugwarrior/gitea-support.patch @@ -840,3 +840,587 @@ index a4c174c8..445846ff 100644 + log.critical('[%s] has no \'gitea.token\' or \'gitea.password\'' % target) + sys.exit(1) + +From 81b3fa0b47db93fb83a54c6727f5ee3c408797c5 Mon Sep 17 00:00:00 2001 +From: msglm +Date: Wed, 22 May 2024 20:59:54 -0500 +Subject: [PATCH 3/4] Add basic documentation + +mish-mash between the github and gitlab documentation with everything I +don't think is supported removed. +--- + bugwarrior/docs/services/gitea.rst | 124 +++++++++++++++++++++++++++++ + 1 file changed, 124 insertions(+) + create mode 100644 bugwarrior/docs/services/gitea.rst + +diff --git a/bugwarrior/docs/services/gitea.rst b/bugwarrior/docs/services/gitea.rst +new file mode 100644 +index 00000000..6ccf2308 +--- /dev/null ++++ b/bugwarrior/docs/services/gitea.rst +@@ -0,0 +1,124 @@ ++Gitea ++====== ++ ++You can import tasks from your Gitea instance using ++the ``gitea`` service name. ++ ++Example Service ++--------------- ++ ++Here's an example of a Gitea target: ++ ++.. config:: ++ ++ [user_gitea] ++ service = gitea ++ gitea.login = ralphbean ++ gitea.username = ralphbean ++ gitea.host = git.bean.com #Note: the lack of https, the service will assume HTTPS by default. 
++ gitea.password = @oracle:eval:pass show 'git.bean.com' ++ gitea.token = 0000000000000000000000000000000 ++ ++The above example is the minimum required to import issues from ++Gitea. You can also feel free to use any of the ++configuration options described in :ref:`common_configuration_options` ++or described in `Service Features`_ below. ++ ++The ``token`` is your private API token. ++ ++Service Features ++---------------- ++ ++Include and Exclude Certain Repositories ++++++++++++++++++++++++++++++++++++++++++ ++ ++If you happen to be working with a large number of projects, you ++may want to pull issues from only a subset of your repositories. To ++do that, you can use the ``include_repos`` option. ++ ++For example, if you would like to only pull-in issues from ++your own ``project_foo`` and team ``bar``'s ``project_fox`` repositories, you ++could add this line to your service configuration (replacing ``me`` by your own ++login): ++ ++.. config:: ++ :fragment: gitea ++ ++ gitea.include_repos = me/project_foo, bar/project_fox ++ ++Alternatively, if you have a particularly noisy repository, you can ++instead choose to import all issues excepting it using the ++``exclude_repos`` configuration option. ++ ++In this example, ``noisy/repository`` is the repository you would ++*not* like issues created for: ++ ++.. config:: ++ :fragment: gitea ++ ++ gitea.exclude_repos = noisy/repository ++ ++.. hint:: ++ If you omit the repository's namespace, bugwarrior will automatically add ++ your login as namespace. E.g. the following are equivalent: ++ ++.. config:: ++ :fragment: gitea ++ ++ gitea.login = foo ++ gitea.include_repos = bar ++ ++and: ++ ++.. config:: ++ :fragment: gitea ++ ++ gitea.login = foo ++ gitea.include_repos = foo/bar ++ ++Alternatively, you can use project IDs instead of names by prefixing the ++project id with `id:`: ++ ++.. 
config:: ++ :fragment: gitea ++ ++ gitea.include_repos = id:1234,id:3141 ++ ++Import Labels as Tags +++++++++++++++++++++++ ++ ++The gitea issue tracker allows you to attach labels to issues; to ++use those labels as tags, you can use the ``import_labels_as_tags`` ++option: ++ ++.. config:: ++ :fragment: gitea ++ ++ gitea.import_labels_as_tags = True ++ ++Also, if you would like to control how these labels are created, you can ++specify a template used for converting the gitea label into a Taskwarrior ++tag. ++ ++For example, to prefix all incoming labels with the string 'gitea_' (perhaps ++to differentiate them from any existing tags you might have), you could ++add the following configuration option: ++ ++.. config:: ++ :fragment: gitea ++ ++ gitea.label_template = gitea_{{label}} ++ ++In addition to the context variable ``{{label}}``, you also have access ++to all fields on the Taskwarrior task if needed: ++ ++.. note:: ++ ++ See :ref:`field_templates` for more details regarding how templates ++ are processed. ++ ++ ++Provided UDA Fields ++------------------- ++ ++.. 
udas:: bugwarrior.services.gitea.GiteaIssue + +From 1626a36c15013fc42e369cdc9ef98c23e10845c2 Mon Sep 17 00:00:00 2001 +From: msglm +Date: Wed, 22 May 2024 21:00:20 -0500 +Subject: [PATCH 4/4] Remove Six usage and clean the codebase + +Suggestions from here are implemented +https://github.com/GothenburgBitFactory/bugwarrior/pull/1048#pullrequestreview-2070021239 +--- + bugwarrior/services/gitea.py | 20 ++++---------------- + 1 file changed, 4 insertions(+), 16 deletions(-) + +diff --git a/bugwarrior/services/gitea.py b/bugwarrior/services/gitea.py +index 445846ff..341ec617 100644 +--- a/bugwarrior/services/gitea.py ++++ b/bugwarrior/services/gitea.py +@@ -15,7 +15,6 @@ + import logging + import pathlib + import re +-import six + import sys + from urllib.parse import urlparse + from urllib.parse import quote_plus +@@ -36,11 +35,10 @@ class GiteaConfig(config.ServiceConfig): + host = "gitea.com" + login: str + token: str +- login: str + username: str + password: str +- exclude_repos = [] +- include_repos = [] ++ exclude_repos = config.ConfigList([]) ++ include_repos = config.ConfigList([]) + + def get(self, key, default=None, to_type=None): + try: +@@ -370,7 +368,7 @@ def __init__(self, *args, **kw): + 'import_labels_as_tags', default=False, to_type=bool + ) + self.label_template = self.config.get( +- 'label_template', default='{{label}}', to_type=six.text_type ++ 'label_template', default='{{label}}', to_type=bool + ) + self.project_owner_prefix = self.config.get( + 'project_owner_prefix', default=False, to_type=bool +@@ -380,7 +378,7 @@ def __init__(self, *args, **kw): + 'query', + default='involves:{user} state:open'.format( + user=self.username) if self.involved_issues else '', +- to_type=six.text_type ++ to_type=str + ) + + @staticmethod +@@ -552,13 +550,3 @@ def issues(self): + issue_obj.update_extra(extra) + yield issue_obj + +- @classmethod +- def validate_config(cls, service_config, target): +- if 'login' not in service_config: +- log.critical('[%s] has no 
\'gitea.login\'' % target) +- sys.exit(1) +- +- if 'token' not in service_config and 'password' not in service_config: +- log.critical('[%s] has no \'gitea.token\' or \'gitea.password\'' % target) +- sys.exit(1) +- + +From 17b725774281e9742b786dbcbcf791f7f3dacf61 Mon Sep 17 00:00:00 2001 +From: msglm +Date: Thu, 23 May 2024 05:03:39 -0500 +Subject: [PATCH 5/6] Intake Critique and simplify + +Remove user+pass auth, token only now. +Added issue API Querying for writing custom queries +Added include_assigned,created,mentioned, and review_requested issues +config settings +Added ability to limit the number of issues you will query (Gitea limits +the API by default to 50, but I host my own instance so I raised it) +get_tags simplified greatly +--- + bugwarrior/services/gitea.py | 178 ++++++++++++++++++++--------------- + 1 file changed, 102 insertions(+), 76 deletions(-) + +diff --git a/bugwarrior/services/gitea.py b/bugwarrior/services/gitea.py +index 341ec617..28a92e96 100644 +--- a/bugwarrior/services/gitea.py ++++ b/bugwarrior/services/gitea.py +@@ -29,16 +29,28 @@ + + log = logging.getLogger(__name__) # pylint: disable-msg=C0103 + ++#TODO: Document this with docstrings + class GiteaConfig(config.ServiceConfig): + service: typing_extensions.Literal['gitea'] +- + host = "gitea.com" +- login: str + token: str + username: str +- password: str +- exclude_repos = config.ConfigList([]) +- include_repos = config.ConfigList([]) ++ include_assigned_issues: bool = False ++ include_created_issues: bool = False ++ include_mentioned_issues: bool = False ++ include_review_requested_issues: bool = False ++ import_labels_as_tags: bool = True ++ involved_issues: bool = False ++ project_owner_prefix: bool = False ++ include_repos: config.ConfigList = config.ConfigList([]) ++ exclude_repos: config.ConfigList = config.ConfigList([]) ++ label_template = str = '{{label}}' ++ filter_pull_requests: bool = False ++ exclude_pull_requests: bool = False ++ """ ++ The maximum number of issues 
the API may get from the host ++ """ ++ issue_limit: int = 100 + + def get(self, key, default=None, to_type=None): + try: +@@ -65,7 +77,7 @@ class GiteaClient(ServiceClient): + - get_repos: + - get_query: + - get_issues: +- - get_directly_assigned_issues: ++ - get_special_issues: + - get_comments: + - get_pulls: + """ +@@ -79,16 +91,8 @@ def __init__(self, host, auth): + + def _api_url(self, path, **context): + """ Build the full url to the API endpoint """ +- # TODO add token support +- if 'basic' in self.auth: +- (username, password) = self.auth['basic'] +- baseurl = 'https://{user}:{secret}@{host}/api/v1'.format( +- host=self.host, +- user=username, +- secret=quote_plus(password)) +- if 'token' in self.auth: +- baseurl = 'https://{host}/api/v1'.format( +- host=self.host) ++ baseurl = 'https://{host}/api/v1'.format( ++ host=self.host) + return baseurl + path.format(**context) + + # TODO Modify these for gitea support +@@ -109,17 +113,17 @@ def get_issues(self, username, repo): + '/repos/{username}/{repo}/issues?per_page=100', + username=username, repo=repo) + return self._getter(url) ++ ++ def get_special_issues(self, username, query: str): ++ """ Returns all issues assigned to authenticated user given a specific query. + +- def get_directly_assigned_issues(self, username): +- """ Returns all issues assigned to authenticated user. +- +- This will return all issues assigned to the authenticated user +- regardless of whether the user owns the repositories in which the +- issues exist. ++ This will return all issues this authenticated user has access to and then ++ filter the issues with the query that the user supplied. 
+ """ +- url = self._api_url('/repos/issues/search', +- username=username, assignee=True) +- return self._getter(url, passedParams={'assigned': True, 'limit': 100}) #TODO: make the limit configurable ++ logging.info("Querying /repos/issues/search with query: " + query) ++ url = self._api_url('/repos/issues/search?{query}', ++ username=username, query=query) ++ return self._getter(url) + + # TODO close to gitea format: /comments/{id} + def get_comments(self, username, repo, number): +@@ -134,7 +138,7 @@ def get_pulls(self, username, repo): + username=username, repo=repo) + return self._getter(url) + +- def _getter(self, url, subkey=None, passedParams={}): ++ def _getter(self, url, subkey=None): + """ Pagination utility. Obnoxious. """ + + kwargs = {} +@@ -145,7 +149,7 @@ def _getter(self, url, subkey=None, passedParams={}): + link = dict(next=url) + + while 'next' in link: +- response = self.session.get(link['next'], params=passedParams, **kwargs) ++ response = self.session.get(link['next'], **kwargs) + + # Warn about the mis-leading 404 error code. See: + # https://gitea.com/ralphbean/bugwarrior/issues/374 +@@ -269,6 +273,9 @@ def to_taskwarrior(self): + if body: + body = body.replace('\r\n', '\n') + ++ if len(body) < 1: ++ body = "No annotation was provided." 
++ + created = self.parse_date(self.record.get('created_at')) + updated = self.parse_date(self.record.get('updated_at')) + closed = self.parse_date(self.record.get('closed_at')) +@@ -295,25 +302,19 @@ def to_taskwarrior(self): + self.NAMESPACE: self.extra['namespace'], + self.STATE: self.record.get('state', '') + } +- + def get_tags(self): +- tags = [] +- +- if not self.config.get('import_labels_as_tags'): +- return tags ++ labels = [label['name'] for label in self.record.get('labels', [])] ++ return self.get_tags_from_labels(labels) + +- context = self.record.copy() +- label_template = Template(self.config.get('label_template')) +- +- for label_dict in self.record.get('labels', []): +- context.update({ +- 'label': self._normalize_label_to_tag(label_dict['name']) +- }) +- tags.append( +- label_template.render(context) +- ) ++ def get_default_description(self): ++ log.info('In get_default_description') ++ return self.build_default_description( ++ title=self.record['title'], ++ url=self.get_processed_url(self.record['url']), ++ number=self.record['number'], ++ cls=self.extra['type'], ++ ) + +- return tags + + def get_default_description(self): + log.info('In get_default_description') +@@ -335,44 +336,42 @@ def __init__(self, *args, **kw): + + auth = {} + token = self.config.token +- self.login = self.config.login + if hasattr(self.config, 'token'): +- token = self.get_password('token', login=self.login) ++ token = self.get_password('token', login=self.config.username) + auth['token'] = token +- elif hasattr(self.config.hasattr, 'password'): +- password = self.get_password('password', login=self.login) +- auth['basic'] = (self.login, password) + else: + #Probably should be called by validate_config, but I don't care to fix that. +- logging.critical("ERROR! Neither token or password was provided in config!") ++ logging.critical("ERROR! 
No token was provided in config!") + sys.exit(1) + ++ #TODO: document these with docstrings + self.client = GiteaClient(host=self.config.host, auth=auth) + + self.host = self.config.host + + self.exclude_repos = self.config.exclude_repos ++ + self.include_repos = self.config.include_repos + + self.username = self.config.username +- self.filter_pull_requests = self.config.get( +- 'filter_pull_requests', default=False, to_type=bool +- ) +- self.exclude_pull_requests = self.config.get( +- 'exclude_pull_requests', default=False, to_type=bool +- ) +- self.involved_issues = self.config.get( +- 'involved_issues', default=False, to_type=bool +- ) +- self.import_labels_as_tags = self.config.get( +- 'import_labels_as_tags', default=False, to_type=bool +- ) +- self.label_template = self.config.get( +- 'label_template', default='{{label}}', to_type=bool +- ) +- self.project_owner_prefix = self.config.get( +- 'project_owner_prefix', default=False, to_type=bool +- ) ++ ++ self.filter_pull_requests = self.config.filter_pull_requests ++ ++ self.exclude_pull_requests = self.config.exclude_pull_requests ++ ++ self.involved_issues = self.config.involved_issues ++ ++ self.project_owner_prefix = self.config.project_owner_prefix ++ ++ self.include_assigned_issues = self.config.include_assigned_issues ++ ++ self.include_created_issues = self.config.include_created_issues ++ ++ self.include_review_requested_issues = self.config.include_review_requested_issues ++ ++ self.import_labels_as_tags = self.config.import_labels_as_tags ++ ++ self.label_template = self.config.label_template + + self.query = self.config.get( + 'query', +@@ -384,11 +383,10 @@ def __init__(self, *args, **kw): + @staticmethod + def get_keyring_service(service_config): + #TODO grok this +- login = service_config.login + username = service_config.username + host = service_config.host +- return 'gitea://{login}@{host}/{username}'.format( +- login=login, username=username, host=host) ++ return 
'gitea://{username}@{host}/{username}'.format( ++ username=username, host=host) + + def get_service_metadata(self): + return { +@@ -417,9 +415,9 @@ def get_query(self, query): + issues[url] = (repo, issue) + return issues + +- def get_directly_assigned_issues(self, username): ++ def get_special_issues(self, username, query): + issues = {} +- for issue in self.client.get_directly_assigned_issues(self.username): ++ for issue in self.client.get_special_issues(self.username, query): + repos = self.get_repository_from_issue(issue) + issues[issue['url']] = (repos, issue) + return issues +@@ -524,10 +522,38 @@ def issues(self): + self.get_owned_repo_issues( + self.username + '/' + repo) + ) +- issues.update( +- filter(self.filter_issues, +- self.get_directly_assigned_issues(self.username).items()) +- ) ++ ++ ''' ++ A variable used to represent the attachable HTTP query that can be attached to the /repos/issues/search API end. ++ ++ if httpQuery is set to "review_requested=True?mentioned=True" for example, then the /repos/issues/search API end will be told to search for all issues where a review is requested AND where the user is mentioned. 
++ ''' ++ httpQuery = "limit=" + str(self.config.issue_limit) + "&" ++ ++ if self.config.get('include_assigned_issues', True, bool): ++ log.info("assigned was true") ++ issues.update( ++ filter(self.filter_issues, ++ self.get_special_issues(self.username, httpQuery + "assigned=true&").items()) ++ ) ++ if self.config.get('include_created_issues', True, bool): ++ log.info("created was true") ++ issues.update( ++ filter(self.filter_issues, ++ self.get_special_issues(self.username, httpQuery + "created=true&").items()) ++ ) ++ if self.config.get('include_mentioned_issues', True, bool): ++ log.info("mentioned was true") ++ issues.update( ++ filter(self.filter_issues, ++ self.get_special_issues(self.username, httpQuery + "mentioned=true&").items()) ++ ) ++ if self.config.get('include_review_requested_issues', True, bool): ++ log.info("review request was true") ++ issues.update( ++ filter(self.filter_issues, ++ self.get_special_issues(self.username, httpQuery + "review_requested=true&").items()) ++ ) + + log.info(' Found %i issues.', len(issues)) # these were debug logs + issues = list(filter(self.include, issues.values())) + +From 3eb6e743c7ee4c7892525c05d880f5d05d3f8600 Mon Sep 17 00:00:00 2001 +From: msglm +Date: Thu, 23 May 2024 05:13:33 -0500 +Subject: [PATCH 6/6] Documentation for previous commit + +--- + bugwarrior/docs/services/gitea.rst | 31 +++++++++++++++++++++++++++--- + 1 file changed, 28 insertions(+), 3 deletions(-) + +diff --git a/bugwarrior/docs/services/gitea.rst b/bugwarrior/docs/services/gitea.rst +index 6ccf2308..19e0930a 100644 +--- a/bugwarrior/docs/services/gitea.rst ++++ b/bugwarrior/docs/services/gitea.rst +@@ -13,11 +13,9 @@ Here's an example of a Gitea target: + + [user_gitea] + service = gitea +- gitea.login = ralphbean + gitea.username = ralphbean + gitea.host = git.bean.com #Note: the lack of https, the service will assume HTTPS by default. 
+- gitea.password = @oracle:eval:pass show 'git.bean.com' +- gitea.token = 0000000000000000000000000000000 ++ gitea.token = @oracle:eval:pass show 'git.bean.com token' + + The above example is the minimum required to import issues from + Gitea. You can also feel free to use any of the +@@ -117,6 +115,33 @@ to all fields on the Taskwarrior task if needed: + See :ref:`field_templates` for more details regarding how templates + are processed. + ++Limit Issues Imported +++++++++++++++++++++++ ++Gitea lets system administrators configure the amount of objects that any given API request will return. ++You may configure the amount to tell Gitea to give to you using the ``issue_limit`` option: ++ ++.. config:: ++ :fragment: gitea ++ ++ gitea.issue_limit = 200 ++ ++Do note, this will not overwrite what the gitea instance limits you to, it merely lets you set the amount of issues you will import. ++ ++ ++Including various types of issues +++++++++++++++++++++++ ++ ++Gitea has metadata attached to each issue, primarily: If you are assigned to an issue, if you created an issue, if an issue mentions you, and if an issue has a review request for you. You may set if each of these traits is worth importing by using the various ``include_*_issues`` options: ++ ++.. config:: ++ :fragment: gitea ++ ++ gitea.include_assigned_issues = true ++ gitea.include_created_issues = true ++ gitea.include_mentioned_issues = true ++ gitea.include_review_requested_issues = true ++ ++Each setting will query the API for that trait alone and then add it to your Taskwarrior task list. For example, if you have created issues and mentioned issues off, but assigned issues and review requested issues on: You will only receive new tasks for the issues you are assigned to do or requested to review, but not for issues you've created or mentioned. Issues that have been assigned to you and created by you would be included though, as these settings merely mark inclusion, not exclusion. 
+ + Provided UDA Fields + ------------------- diff --git a/aux-files/snscrape/snscrape-downloads-telegram.patch b/aux-files/snscrape/snscrape-downloads-telegram.patch new file mode 100644 index 0000000..46665c4 --- /dev/null +++ b/aux-files/snscrape/snscrape-downloads-telegram.patch @@ -0,0 +1,495 @@ +From 00239388e3096277a55271a8786b4b5d6d2bec84 Mon Sep 17 00:00:00 2001 +From: John O'Sullivan +Date: Thu, 18 Jan 2024 11:37:32 -0500 +Subject: [PATCH 1/8] WIP: Fixed 2.5 out of 5 issues mentioned in PR + +--- + snscrape/base.py | 1 + + snscrape/modules/telegram.py | 12 ++++++------ + 2 files changed, 7 insertions(+), 6 deletions(-) + +diff --git a/snscrape/base.py b/snscrape/base.py +index c9e75d9d..5ce5e1da 100644 +--- a/snscrape/base.py ++++ b/snscrape/base.py +@@ -193,6 +193,7 @@ def _request(self, method, url, params = None, data = None, headers = None, time + # The request is newly prepared on each retry because of potential cookie updates. + req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers)) + environmentSettings = self._session.merge_environment_settings(req.url, proxies, None, None, None) ++ _logger.info("Hey there, I'm in here") + _logger.info(f'Retrieving {req.url}') + _logger.debug(f'... 
with headers: {headers!r}') + if data: +diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py +index 4e977656..54345d96 100644 +--- a/snscrape/modules/telegram.py ++++ b/snscrape/modules/telegram.py +@@ -196,6 +196,7 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False): + } + timeTag = videoPlayer.find('time') + if timeTag is None: ++ _logger.warning(f'Could not find duration for video or GIF at {url}') + cls = Gif + else: + cls = Video +@@ -219,8 +220,6 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False): + else: + _logger.warning(f'Could not process link preview image on {url}') + linkPreview = LinkPreview(**kwargs) +- if kwargs['href'] in outlinks: +- outlinks.remove(kwargs['href']) + + viewsSpan = post.find('span', class_ = 'tgme_widget_message_views') + views = None if viewsSpan is None else _parse_num(viewsSpan.text) +@@ -239,13 +238,14 @@ def get_items(self): + return + nextPageUrl = '' + while True: ++ print("About to yield from get_items") + yield from self._soup_to_items(soup, r.url) +- try: +- if soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True)['href'].split('/')[-1] == '1': ++ dateElt = soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True) ++ if dateElt and 'href' in dateElt.attrs: ++ urlPieces = dateElt['href'].split('/') ++ if urlPieces and urlPieces[-1] == '1': + # if message 1 is the first message in the page, terminate scraping + break +- except: +- pass + pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True}) + if not pageLink: + # some pages are missing a "tme_messages_more" tag, causing early termination + +From 670905fedb64656b94c6fb920c8628d318171b64 Mon Sep 17 00:00:00 2001 +From: John O'Sullivan +Date: Thu, 18 Jan 2024 11:46:46 -0500 +Subject: [PATCH 2/8] Remove test log statement, add link to example GIF + +--- + snscrape/base.py | 1 - + snscrape/modules/telegram.py | 2 +- + 2 files changed, 1 insertion(+), 2 
deletions(-) + +diff --git a/snscrape/base.py b/snscrape/base.py +index 5ce5e1da..c9e75d9d 100644 +--- a/snscrape/base.py ++++ b/snscrape/base.py +@@ -193,7 +193,6 @@ def _request(self, method, url, params = None, data = None, headers = None, time + # The request is newly prepared on each retry because of potential cookie updates. + req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers)) + environmentSettings = self._session.merge_environment_settings(req.url, proxies, None, None, None) +- _logger.info("Hey there, I'm in here") + _logger.info(f'Retrieving {req.url}') + _logger.debug(f'... with headers: {headers!r}') + if data: +diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py +index 54345d96..01e99318 100644 +--- a/snscrape/modules/telegram.py ++++ b/snscrape/modules/telegram.py +@@ -196,7 +196,7 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False): + } + timeTag = videoPlayer.find('time') + if timeTag is None: +- _logger.warning(f'Could not find duration for video or GIF at {url}') ++ # Example of duration-less GIF: https://t.me/thisisatestchannel19451923/3 + cls = Gif + else: + cls = Video + +From 54df8832f5b5bc3af58c3faf953966a2070a834d Mon Sep 17 00:00:00 2001 +From: John O'Sullivan +Date: Thu, 22 Feb 2024 01:06:04 -0500 +Subject: [PATCH 3/8] Added media processing into main link loop; using prev + tag to get page, rather than index math + +--- + snscrape/modules/telegram.py | 84 +++++++++++++++++++----------------- + 1 file changed, 44 insertions(+), 40 deletions(-) + +diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py +index 01e99318..b4f3d78e 100644 +--- a/snscrape/modules/telegram.py ++++ b/snscrape/modules/telegram.py +@@ -152,7 +152,7 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False): + imageUrls = _STYLE_MEDIA_URL_PATTERN.findall(style) + if len(imageUrls) == 1: + media.append(Photo(url = imageUrls[0])) +- continue ++ + if 
_SINGLE_MEDIA_LINK_PATTERN.match(link['href']): + style = link.attrs.get('style', '') + imageUrls = _STYLE_MEDIA_URL_PATTERN.findall(style) +@@ -161,49 +161,23 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False): + # resp = self._get(image[0]) + # encoded_string = base64.b64encode(resp.content) + # Individual photo or video link +- continue ++ + if link.text.startswith('@'): + mentions.append(link.text.strip('@')) +- continue ++ + if link.text.startswith('#'): + hashtags.append(link.text.strip('#')) +- continue ++ ++ if 'tgme_widget_message_voice_player' in link.get('class', []): ++ media.append(_parse_voice_message(link)) ++ ++ if 'tgme_widget_message_video_player' in link.get('class', []): ++ media.append(_parse_video_message(link)) ++ + href = urllib.parse.urljoin(pageUrl, link['href']) + if (href not in outlinks) and (href != rawUrl) and (href != forwardedUrl): + outlinks.append(href) + +- for voicePlayer in post.find_all('a', {'class': 'tgme_widget_message_voice_player'}): +- audioUrl = voicePlayer.find('audio')['src'] +- durationStr = voicePlayer.find('time').text +- duration = _durationStrToSeconds(durationStr) +- barHeights = [float(s['style'].split(':')[-1].strip(';%')) for s in voicePlayer.find('div', {'class': 'bar'}).find_all('s')] +- +- media.append(VoiceMessage(url = audioUrl, duration = duration, bars = barHeights)) +- +- for videoPlayer in post.find_all('a', {'class': 'tgme_widget_message_video_player'}): +- iTag = videoPlayer.find('i') +- if iTag is None: +- videoUrl = None +- videoThumbnailUrl = None +- else: +- style = iTag['style'] +- videoThumbnailUrl = _STYLE_MEDIA_URL_PATTERN.findall(style)[0] +- videoTag = videoPlayer.find('video') +- videoUrl = None if videoTag is None else videoTag['src'] +- mKwargs = { +- 'thumbnailUrl': videoThumbnailUrl, +- 'url': videoUrl, +- } +- timeTag = videoPlayer.find('time') +- if timeTag is None: +- # Example of duration-less GIF: https://t.me/thisisatestchannel19451923/3 +- cls = Gif +- else: +- 
cls = Video +- durationStr = videoPlayer.find('time').text +- mKwargs['duration'] = _durationStrToSeconds(durationStr) +- media.append(cls(**mKwargs)) +- + linkPreview = None + if (linkPreviewA := post.find('a', class_ = 'tgme_widget_message_link_preview')): + kwargs = {} +@@ -250,10 +224,10 @@ def get_items(self): + if not pageLink: + # some pages are missing a "tme_messages_more" tag, causing early termination + if '=' not in nextPageUrl: +- nextPageUrl = soup.find('link', attrs = {'rel': 'canonical'}, href = True)['href'] +- nextPostIndex = int(nextPageUrl.split('=')[-1]) - 20 ++ nextPageUrl = soup.find('link', attrs = {'rel': 'prev'}, href = True)['href'] ++ nextPostIndex = int(nextPageUrl.split('=')[-1]) + if nextPostIndex > 20: +- pageLink = {'href': nextPageUrl.split('=')[0] + f'={nextPostIndex}'} ++ pageLink = {'href': nextPageUrl} + else: + break + nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href']) +@@ -333,4 +307,34 @@ def _telegramResponseOkCallback(r): + if r.status_code == 200: + return (True, None) + return (False, f'{r.status_code=}') +- +\ No newline at end of file ++ ++def _parse_voice_message(voicePlayer): ++ audioUrl = voicePlayer.find('audio')['src'] ++ durationStr = voicePlayer.find('time').text ++ duration = _durationStrToSeconds(durationStr) ++ barHeights = [float(s['style'].split(':')[-1].strip(';%')) for s in voicePlayer.find('div', {'class': 'bar'}).find_all('s')] ++ return VoiceMessage(url = audioUrl, duration = duration, bars = barHeights) ++ ++def _parse_video_message(videoPlayer): ++ iTag = videoPlayer.find('i') ++ if iTag is None: ++ videoUrl = None ++ videoThumbnailUrl = None ++ else: ++ style = iTag['style'] ++ videoThumbnailUrl = _STYLE_MEDIA_URL_PATTERN.findall(style)[0] ++ videoTag = videoPlayer.find('video') ++ videoUrl = None if videoTag is None else videoTag['src'] ++ mKwargs = { ++ 'thumbnailUrl': videoThumbnailUrl, ++ 'url': videoUrl, ++ } ++ timeTag = videoPlayer.find('time') ++ if timeTag is None: ++ # Example of 
duration-less GIF: https://t.me/thisisatestchannel19451923/3 ++ cls = Gif ++ else: ++ cls = Video ++ durationStr = videoPlayer.find('time').text ++ mKwargs['duration'] = _durationStrToSeconds(durationStr) ++ return cls(**mKwargs) +\ No newline at end of file + +From 2dfd1542f19bbadad603e00e61712943542fbfe1 Mon Sep 17 00:00:00 2001 +From: John O'Sullivan +Date: Thu, 22 Feb 2024 01:07:46 -0500 +Subject: [PATCH 4/8] Forgot to remove a test log + +--- + snscrape/modules/telegram.py | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py +index b4f3d78e..8f6d18d7 100644 +--- a/snscrape/modules/telegram.py ++++ b/snscrape/modules/telegram.py +@@ -212,7 +212,6 @@ def get_items(self): + return + nextPageUrl = '' + while True: +- print("About to yield from get_items") + yield from self._soup_to_items(soup, r.url) + dateElt = soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True) + if dateElt and 'href' in dateElt.attrs: + +From a93f6a3fad0d19209a49c7b730fea73659743774 Mon Sep 17 00:00:00 2001 +From: John O'Sullivan +Date: Fri, 1 Mar 2024 12:51:26 -0500 +Subject: [PATCH 5/8] Applying trislee's suggested fix for getting nextPageUrl + +--- + snscrape/modules/telegram.py | 16 +++++----------- + 1 file changed, 5 insertions(+), 11 deletions(-) + +diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py +index 8f6d18d7..ac0feef8 100644 +--- a/snscrape/modules/telegram.py ++++ b/snscrape/modules/telegram.py +@@ -219,17 +219,11 @@ def get_items(self): + if urlPieces and urlPieces[-1] == '1': + # if message 1 is the first message in the page, terminate scraping + break +- pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True}) +- if not pageLink: +- # some pages are missing a "tme_messages_more" tag, causing early termination +- if '=' not in nextPageUrl: +- nextPageUrl = soup.find('link', attrs = {'rel': 'prev'}, href = True)['href'] +- nextPostIndex = 
int(nextPageUrl.split('=')[-1]) +- if nextPostIndex > 20: +- pageLink = {'href': nextPageUrl} +- else: +- break +- nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href']) ++ if pageLink := soup.find('link', attrs = {'rel': 'prev'}, href = True): ++ nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href']) ++ else: ++ nextPostIndex = int(soup.find('div', attrs = {'class': 'tgme_widget_message', 'data-post': True})["data-post"].split("/")[-1]) ++ nextPageUrl = urllib.parse.urljoin(r.url, r.url.split('?')[0] + f'?before={nextPostIndex}') + r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = _telegramResponseOkCallback) + if r.status_code != 200: + raise snscrape.base.ScraperException(f'Got status code {r.status_code}') + +From a542aa57598f94f69fd7b69789e97045e92133da Mon Sep 17 00:00:00 2001 +From: John O'Sullivan +Date: Thu, 14 Mar 2024 01:50:38 -0400 +Subject: [PATCH 6/8] Ensured termination on channels w/o an id=1 post, wrote + test cases to prevent regression + +--- + snscrape/modules/telegram.py | 87 +++++++++++++++++++++++++++++++++++- + 1 file changed, 86 insertions(+), 1 deletion(-) + +diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py +index ac0feef8..7a85cb58 100644 +--- a/snscrape/modules/telegram.py ++++ b/snscrape/modules/telegram.py +@@ -9,6 +9,8 @@ + import snscrape.base + import typing + import urllib.parse ++import unittest ++import threading + + _logger = logging.getLogger(__name__) + _SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$') +@@ -212,6 +214,8 @@ def get_items(self): + return + nextPageUrl = '' + while True: ++ if soup.find("div", class_ = "tme_no_messages_found"): ++ break + yield from self._soup_to_items(soup, r.url) + dateElt = soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True) + if dateElt and 'href' in dateElt.attrs: +@@ -330,4 +334,85 @@ def _parse_video_message(videoPlayer): + cls = Video + durationStr = videoPlayer.find('time').text + 
mKwargs['duration'] = _durationStrToSeconds(durationStr) +- return cls(**mKwargs) +\ No newline at end of file ++ return cls(**mKwargs) ++ ++class TestTelegramChannelScraper(unittest.TestCase): ++ ++ @staticmethod ++ def execute_with_timeout(func, timeout=10): ++ """ ++ Executes a function in a separate thread and enforces a timeout. ++ If provided function throws an error, it's re-raised in main thread. ++ Used to detect infinite loops in finite time, works cross-platform. ++ ++ :param func: The function to execute. This function should accept no arguments. ++ :param timeout: The timeout in seconds. ++ """ ++ exceptions=[] ++ def func_passing_exceptions(): ++ try: ++ func() ++ except Exception as e: ++ exceptions.append((e.__class__, e, e.__traceback__)) ++ ++ thread = threading.Thread(target=func_passing_exceptions) ++ thread.start() ++ thread.join(timeout=timeout) ++ ++ if exceptions: ++ exc_class, exc_instance, traceback = exceptions[0] ++ raise exc_class(exc_instance).with_traceback(traceback) ++ ++ if thread.is_alive(): ++ raise TimeoutError(f"Function didn't complete within {timeout} seconds") ++ ++ def test_scraping_termination_missing_prev(self): ++ """Test scraping always terminates, even if the page's prev link is missing.""" ++ ++ def scrape_two_pages(): ++ scraper = TelegramChannelScraper('WLM_USA_TEXAS?before=3766') ++ items = list() ++ num_items_on_page = 20 ++ for item in scraper.get_items(): ++ items.append(item) ++ if len(items) > 2 * num_items_on_page: ++ break ++ ++ self.execute_with_timeout(scrape_two_pages) ++ ++ def test_scraping_termination_small_post_count(self): ++ """Test scraping always terminates, even with small number of posts. 
This channel has only 28.""" ++ ++ def scrape_small_channel(): ++ scraper = TelegramChannelScraper('AKCPB') ++ items = list(scraper.get_items()) ++ return items ++ ++ self.execute_with_timeout(scrape_small_channel) ++ ++ def test_scraping_termination_channels_without_post_id_one(self): ++ """Test scraping gracefully handles channels missing a post where id=1.""" ++ ++ def scrape_empty_page(): ++ scraper = TelegramChannelScraper('BREAKDCODE?before=3') ++ for _ in scraper.get_items(): ++ pass ++ ++ self.execute_with_timeout(scrape_empty_page) ++ ++ def test_media_order_preservation(self): ++ """Test scraped media appears in the same order as in the post.""" ++ scraper = TelegramChannelScraper('nexta_live?before=43103') ++ item = next(scraper.get_items(), None) ++ self.assertIsNotNone(item, "Failed to scrape any posts.") ++ self.assertEqual(item.url, "https://t.me/s/nexta_live/43102") ++ ++ # Directly validate the types of the objects in the media array ++ expected_types = [Video, Photo, Video] # Adjust based on expected types ++ actual_types = [type(media) for media in item.media] if item.media else [] ++ ++ self.assertEqual(actual_types, expected_types, "Media did not appear in the expected order.") ++ ++ ++if __name__ == '__main__': ++ unittest.main() + +From 7d061cb5279e153f829340f848bc4ba01d716f26 Mon Sep 17 00:00:00 2001 +From: John O'Sullivan +Date: Thu, 14 Mar 2024 01:55:16 -0400 +Subject: [PATCH 7/8] Add docstring saying suite should run by directly running + file + +--- + snscrape/modules/telegram.py | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py +index 7a85cb58..c6e0b0ee 100644 +--- a/snscrape/modules/telegram.py ++++ b/snscrape/modules/telegram.py +@@ -336,7 +336,9 @@ def _parse_video_message(videoPlayer): + mKwargs['duration'] = _durationStrToSeconds(durationStr) + return cls(**mKwargs) + ++ + class TestTelegramChannelScraper(unittest.TestCase): ++ """Run suite by directly calling this 
file.""" + + @staticmethod + def execute_with_timeout(func, timeout=10): + +From 9309b1b01c6db15862809623e2c5adddecd894be Mon Sep 17 00:00:00 2001 +From: John O'Sullivan +Date: Thu, 14 Mar 2024 02:00:50 -0400 +Subject: [PATCH 8/8] Correct some inaccurate test descriptions + +--- + snscrape/modules/telegram.py | 13 +++++++------ + 1 file changed, 7 insertions(+), 6 deletions(-) + +diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py +index c6e0b0ee..dbf0f9b3 100644 +--- a/snscrape/modules/telegram.py ++++ b/snscrape/modules/telegram.py +@@ -338,7 +338,7 @@ def _parse_video_message(videoPlayer): + + + class TestTelegramChannelScraper(unittest.TestCase): +- """Run suite by directly calling this file.""" ++ """Run suite by directly running this file.""" + + @staticmethod + def execute_with_timeout(func, timeout=10): +@@ -383,7 +383,7 @@ def scrape_two_pages(): + self.execute_with_timeout(scrape_two_pages) + + def test_scraping_termination_small_post_count(self): +- """Test scraping always terminates, even with small number of posts. This channel has only 28.""" ++ """Test scraping always terminates, even with small number of posts. 
This channel's highest ID is 28.""" + + def scrape_small_channel(): + scraper = TelegramChannelScraper('AKCPB') +@@ -392,8 +392,8 @@ def scrape_small_channel(): + + self.execute_with_timeout(scrape_small_channel) + +- def test_scraping_termination_channels_without_post_id_one(self): +- """Test scraping gracefully handles channels missing a post where id=1.""" ++ def test_scraping_termination_pages_without_posts(self): ++ """Test scraping gracefully handles pages without any posts.""" + + def scrape_empty_page(): + scraper = TelegramChannelScraper('BREAKDCODE?before=3') +@@ -407,10 +407,11 @@ def test_media_order_preservation(self): + scraper = TelegramChannelScraper('nexta_live?before=43103') + item = next(scraper.get_items(), None) + self.assertIsNotNone(item, "Failed to scrape any posts.") ++ ++ # This particular post is known to include media [Video, Photo, Video] + self.assertEqual(item.url, "https://t.me/s/nexta_live/43102") + +- # Directly validate the types of the objects in the media array +- expected_types = [Video, Photo, Video] # Adjust based on expected types ++ expected_types = [Video, Photo, Video] + actual_types = [type(media) for media in item.media] if item.media else [] + + self.assertEqual(actual_types, expected_types, "Media did not appear in the expected order.") -- cgit v1.2.3