From 91f5e47b21c2108f0031f339413904b84fdefec5 Mon Sep 17 00:00:00 2001 From: msglm Date: Thu, 23 May 2024 05:16:36 -0500 Subject: Revert "remove snsscrape patch" This reverts commit 12a4266ca2a4916e16cb387a4f65dbea0f217d6f. --- .../snscrape/snscrape-downloads-telegram.patch | 495 +++++++++++++++++++++ 1 file changed, 495 insertions(+) create mode 100644 aux-files/snscrape/snscrape-downloads-telegram.patch diff --git a/aux-files/snscrape/snscrape-downloads-telegram.patch b/aux-files/snscrape/snscrape-downloads-telegram.patch new file mode 100644 index 0000000..46665c4 --- /dev/null +++ b/aux-files/snscrape/snscrape-downloads-telegram.patch @@ -0,0 +1,495 @@ +From 00239388e3096277a55271a8786b4b5d6d2bec84 Mon Sep 17 00:00:00 2001 +From: John O'Sullivan +Date: Thu, 18 Jan 2024 11:37:32 -0500 +Subject: [PATCH 1/8] WIP: Fixed 2.5 out of 5 issues mentioned in PR + +--- + snscrape/base.py | 1 + + snscrape/modules/telegram.py | 12 ++++++------ + 2 files changed, 7 insertions(+), 6 deletions(-) + +diff --git a/snscrape/base.py b/snscrape/base.py +index c9e75d9d..5ce5e1da 100644 +--- a/snscrape/base.py ++++ b/snscrape/base.py +@@ -193,6 +193,7 @@ def _request(self, method, url, params = None, data = None, headers = None, time + # The request is newly prepared on each retry because of potential cookie updates. + req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers)) + environmentSettings = self._session.merge_environment_settings(req.url, proxies, None, None, None) ++ _logger.info("Hey there, I'm in here") + _logger.info(f'Retrieving {req.url}') + _logger.debug(f'... with headers: {headers!r}') + if data: +diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py +index 4e977656..54345d96 100644 +--- a/snscrape/modules/telegram.py ++++ b/snscrape/modules/telegram.py +@@ -196,6 +196,7 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False): + } + timeTag = videoPlayer.find('time') + if timeTag is None: ++ _logger.warning(f'Could not find duration for video or GIF at {url}') + cls = Gif + else: + cls = Video +@@ -219,8 +220,6 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False): + else: + _logger.warning(f'Could not process link preview image on {url}') + linkPreview = LinkPreview(**kwargs) +- if kwargs['href'] in outlinks: +- outlinks.remove(kwargs['href']) + + viewsSpan = post.find('span', class_ = 'tgme_widget_message_views') + views = None if viewsSpan is None else _parse_num(viewsSpan.text) +@@ -239,13 +238,14 @@ def get_items(self): + return + nextPageUrl = '' + while True: ++ print("About to yield from get_items") + yield from self._soup_to_items(soup, r.url) +- try: +- if soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True)['href'].split('/')[-1] == '1': ++ dateElt = soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True) ++ if dateElt and 'href' in dateElt.attrs: ++ urlPieces = dateElt['href'].split('/') ++ if urlPieces and urlPieces[-1] == '1': + # if message 1 is the first message in the page, terminate scraping + break +- except: +- pass + pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True}) + if not pageLink: + # some pages are missing a "tme_messages_more" tag, causing early termination + +From 670905fedb64656b94c6fb920c8628d318171b64 Mon Sep 17 00:00:00 2001 +From: John O'Sullivan +Date: Thu, 18 Jan 2024 11:46:46 -0500 +Subject: [PATCH 2/8] Remove test log statement, add link to example GIF + +--- + snscrape/base.py | 1 - + snscrape/modules/telegram.py | 2 +- + 2 files changed, 1 insertion(+), 2 deletions(-) + +diff --git a/snscrape/base.py b/snscrape/base.py +index 5ce5e1da..c9e75d9d 100644 +--- a/snscrape/base.py ++++ b/snscrape/base.py +@@ -193,7 +193,6 @@ def _request(self, method, url, params = None, data = None, headers = None, time + # The request is newly prepared on each retry because of potential cookie updates. + req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers)) + environmentSettings = self._session.merge_environment_settings(req.url, proxies, None, None, None) +- _logger.info("Hey there, I'm in here") + _logger.info(f'Retrieving {req.url}') + _logger.debug(f'... with headers: {headers!r}') + if data: +diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py +index 54345d96..01e99318 100644 +--- a/snscrape/modules/telegram.py ++++ b/snscrape/modules/telegram.py +@@ -196,7 +196,7 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False): + } + timeTag = videoPlayer.find('time') + if timeTag is None: +- _logger.warning(f'Could not find duration for video or GIF at {url}') ++ # Example of duration-less GIF: https://t.me/thisisatestchannel19451923/3 + cls = Gif + else: + cls = Video + +From 54df8832f5b5bc3af58c3faf953966a2070a834d Mon Sep 17 00:00:00 2001 +From: John O'Sullivan +Date: Thu, 22 Feb 2024 01:06:04 -0500 +Subject: [PATCH 3/8] Added media processing into main link loop; using prev + tag to get page, rather than index math + +--- + snscrape/modules/telegram.py | 84 +++++++++++++++++++----------------- + 1 file changed, 44 insertions(+), 40 deletions(-) + +diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py +index 01e99318..b4f3d78e 100644 +--- a/snscrape/modules/telegram.py ++++ b/snscrape/modules/telegram.py +@@ -152,7 +152,7 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False): + imageUrls = _STYLE_MEDIA_URL_PATTERN.findall(style) + if len(imageUrls) == 1: + media.append(Photo(url = imageUrls[0])) +- continue ++ + if _SINGLE_MEDIA_LINK_PATTERN.match(link['href']): + style = link.attrs.get('style', '') + imageUrls = _STYLE_MEDIA_URL_PATTERN.findall(style) +@@ -161,49 +161,23 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False): + # resp = self._get(image[0]) + # encoded_string = base64.b64encode(resp.content) + # Individual photo or video link +- continue ++ + if link.text.startswith('@'): + mentions.append(link.text.strip('@')) +- continue ++ + if link.text.startswith('#'): + hashtags.append(link.text.strip('#')) +- continue ++ ++ if 'tgme_widget_message_voice_player' in link.get('class', []): ++ media.append(_parse_voice_message(link)) ++ ++ if 'tgme_widget_message_video_player' in link.get('class', []): ++ media.append(_parse_video_message(link)) ++ + href = urllib.parse.urljoin(pageUrl, link['href']) + if (href not in outlinks) and (href != rawUrl) and (href != forwardedUrl): + outlinks.append(href) + +- for voicePlayer in post.find_all('a', {'class': 'tgme_widget_message_voice_player'}): +- audioUrl = voicePlayer.find('audio')['src'] +- durationStr = voicePlayer.find('time').text +- duration = _durationStrToSeconds(durationStr) +- barHeights = [float(s['style'].split(':')[-1].strip(';%')) for s in voicePlayer.find('div', {'class': 'bar'}).find_all('s')] +- +- media.append(VoiceMessage(url = audioUrl, duration = duration, bars = barHeights)) +- +- for videoPlayer in post.find_all('a', {'class': 'tgme_widget_message_video_player'}): +- iTag = videoPlayer.find('i') +- if iTag is None: +- videoUrl = None +- videoThumbnailUrl = None +- else: +- style = iTag['style'] +- videoThumbnailUrl = _STYLE_MEDIA_URL_PATTERN.findall(style)[0] +- videoTag = videoPlayer.find('video') +- videoUrl = None if videoTag is None else videoTag['src'] +- mKwargs = { +- 'thumbnailUrl': videoThumbnailUrl, +- 'url': videoUrl, +- } +- timeTag = videoPlayer.find('time') +- if timeTag is None: +- # Example of duration-less GIF: https://t.me/thisisatestchannel19451923/3 +- cls = Gif +- else: +- cls = Video +- durationStr = videoPlayer.find('time').text +- mKwargs['duration'] = _durationStrToSeconds(durationStr) +- media.append(cls(**mKwargs)) +- + linkPreview = None + if (linkPreviewA := post.find('a', class_ = 'tgme_widget_message_link_preview')): + kwargs = {} +@@ -250,10 +224,10 @@ def get_items(self): + if not pageLink: + # some pages are missing a "tme_messages_more" tag, causing early termination + if '=' not in nextPageUrl: +- nextPageUrl = soup.find('link', attrs = {'rel': 'canonical'}, href = True)['href'] +- nextPostIndex = int(nextPageUrl.split('=')[-1]) - 20 ++ nextPageUrl = soup.find('link', attrs = {'rel': 'prev'}, href = True)['href'] ++ nextPostIndex = int(nextPageUrl.split('=')[-1]) + if nextPostIndex > 20: +- pageLink = {'href': nextPageUrl.split('=')[0] + f'={nextPostIndex}'} ++ pageLink = {'href': nextPageUrl} + else: + break + nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href']) +@@ -333,4 +307,34 @@ def _telegramResponseOkCallback(r): + if r.status_code == 200: + return (True, None) + return (False, f'{r.status_code=}') +- +\ No newline at end of file ++ ++def _parse_voice_message(voicePlayer): ++ audioUrl = voicePlayer.find('audio')['src'] ++ durationStr = voicePlayer.find('time').text ++ duration = _durationStrToSeconds(durationStr) ++ barHeights = [float(s['style'].split(':')[-1].strip(';%')) for s in voicePlayer.find('div', {'class': 'bar'}).find_all('s')] ++ return VoiceMessage(url = audioUrl, duration = duration, bars = barHeights) ++ ++def _parse_video_message(videoPlayer): ++ iTag = videoPlayer.find('i') ++ if iTag is None: ++ videoUrl = None ++ videoThumbnailUrl = None ++ else: ++ style = iTag['style'] ++ videoThumbnailUrl = _STYLE_MEDIA_URL_PATTERN.findall(style)[0] ++ videoTag = videoPlayer.find('video') ++ videoUrl = None if videoTag is None else videoTag['src'] ++ mKwargs = { ++ 'thumbnailUrl': videoThumbnailUrl, ++ 'url': videoUrl, ++ } ++ timeTag = videoPlayer.find('time') ++ if timeTag is None: ++ # Example of duration-less GIF: https://t.me/thisisatestchannel19451923/3 ++ cls = Gif ++ else: ++ cls = Video ++ durationStr = videoPlayer.find('time').text ++ mKwargs['duration'] = _durationStrToSeconds(durationStr) ++ return cls(**mKwargs) +\ No newline at end of file + +From 2dfd1542f19bbadad603e00e61712943542fbfe1 Mon Sep 17 00:00:00 2001 +From: John O'Sullivan +Date: Thu, 22 Feb 2024 01:07:46 -0500 +Subject: [PATCH 4/8] Forgot to remove a test log + +--- + snscrape/modules/telegram.py | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py +index b4f3d78e..8f6d18d7 100644 +--- a/snscrape/modules/telegram.py ++++ b/snscrape/modules/telegram.py +@@ -212,7 +212,6 @@ def get_items(self): + return + nextPageUrl = '' + while True: +- print("About to yield from get_items") + yield from self._soup_to_items(soup, r.url) + dateElt = soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True) + if dateElt and 'href' in dateElt.attrs: + +From a93f6a3fad0d19209a49c7b730fea73659743774 Mon Sep 17 00:00:00 2001 +From: John O'Sullivan +Date: Fri, 1 Mar 2024 12:51:26 -0500 +Subject: [PATCH 5/8] Applying trislee's suggested fix for getting nextPageUrl + +--- + snscrape/modules/telegram.py | 16 +++++----------- + 1 file changed, 5 insertions(+), 11 deletions(-) + +diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py +index 8f6d18d7..ac0feef8 100644 +--- a/snscrape/modules/telegram.py ++++ b/snscrape/modules/telegram.py +@@ -219,17 +219,11 @@ def get_items(self): + if urlPieces and urlPieces[-1] == '1': + # if message 1 is the first message in the page, terminate scraping + break +- pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True}) +- if not pageLink: +- # some pages are missing a "tme_messages_more" tag, causing early termination +- if '=' not in nextPageUrl: +- nextPageUrl = soup.find('link', attrs = {'rel': 'prev'}, href = True)['href'] +- nextPostIndex = int(nextPageUrl.split('=')[-1]) +- if nextPostIndex > 20: +- pageLink = {'href': nextPageUrl} +- else: +- break +- nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href']) ++ if pageLink := soup.find('link', attrs = {'rel': 'prev'}, href = True): ++ nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href']) ++ else: ++ nextPostIndex = int(soup.find('div', attrs = {'class': 'tgme_widget_message', 'data-post': True})["data-post"].split("/")[-1]) ++ nextPageUrl = urllib.parse.urljoin(r.url, r.url.split('?')[0] + f'?before={nextPostIndex}') + r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = _telegramResponseOkCallback) + if r.status_code != 200: + raise snscrape.base.ScraperException(f'Got status code {r.status_code}') + +From a542aa57598f94f69fd7b69789e97045e92133da Mon Sep 17 00:00:00 2001 +From: John O'Sullivan +Date: Thu, 14 Mar 2024 01:50:38 -0400 +Subject: [PATCH 6/8] Ensured termination on channels w/o an id=1 post, wrote + test cases to prevent regression + +--- + snscrape/modules/telegram.py | 87 +++++++++++++++++++++++++++++++++++- + 1 file changed, 86 insertions(+), 1 deletion(-) + +diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py +index ac0feef8..7a85cb58 100644 +--- a/snscrape/modules/telegram.py ++++ b/snscrape/modules/telegram.py +@@ -9,6 +9,8 @@ + import snscrape.base + import typing + import urllib.parse ++import unittest ++import threading + + _logger = logging.getLogger(__name__) + _SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$') +@@ -212,6 +214,8 @@ def get_items(self): + return + nextPageUrl = '' + while True: ++ if soup.find("div", class_ = "tme_no_messages_found"): ++ break + yield from self._soup_to_items(soup, r.url) + dateElt = soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True) + if dateElt and 'href' in dateElt.attrs: +@@ -330,4 +334,85 @@ def _parse_video_message(videoPlayer): + cls = Video + durationStr = videoPlayer.find('time').text + mKwargs['duration'] = _durationStrToSeconds(durationStr) +- return cls(**mKwargs) +\ No newline at end of file ++ return cls(**mKwargs) ++ ++class TestTelegramChannelScraper(unittest.TestCase): ++ ++ @staticmethod ++ def execute_with_timeout(func, timeout=10): ++ """ ++ Executes a function in a separate thread and enforces a timeout. ++ If provided function throws an error, it's re-raised in main thread. ++ Used to detect infinite loops in finite time, works cross-platform. ++ ++ :param func: The function to execute. This function should accept no arguments. ++ :param timeout: The timeout in seconds. ++ """ ++ exceptions=[] ++ def func_passing_exceptions(): ++ try: ++ func() ++ except Exception as e: ++ exceptions.append((e.__class__, e, e.__traceback__)) ++ ++ thread = threading.Thread(target=func_passing_exceptions) ++ thread.start() ++ thread.join(timeout=timeout) ++ ++ if exceptions: ++ exc_class, exc_instance, traceback = exceptions[0] ++ raise exc_class(exc_instance).with_traceback(traceback) ++ ++ if thread.is_alive(): ++ raise TimeoutError(f"Function didn't complete within {timeout} seconds") ++ ++ def test_scraping_termination_missing_prev(self): ++ """Test scraping always terminates, even if the page's prev link is missing.""" ++ ++ def scrape_two_pages(): ++ scraper = TelegramChannelScraper('WLM_USA_TEXAS?before=3766') ++ items = list() ++ num_items_on_page = 20 ++ for item in scraper.get_items(): ++ items.append(item) ++ if len(items) > 2 * num_items_on_page: ++ break ++ ++ self.execute_with_timeout(scrape_two_pages) ++ ++ def test_scraping_termination_small_post_count(self): ++ """Test scraping always terminates, even with small number of posts. This channel has only 28.""" ++ ++ def scrape_small_channel(): ++ scraper = TelegramChannelScraper('AKCPB') ++ items = list(scraper.get_items()) ++ return items ++ ++ self.execute_with_timeout(scrape_small_channel) ++ ++ def test_scraping_termination_channels_without_post_id_one(self): ++ """Test scraping gracefully handles channels missing a post where id=1.""" ++ ++ def scrape_empty_page(): ++ scraper = TelegramChannelScraper('BREAKDCODE?before=3') ++ for _ in scraper.get_items(): ++ pass ++ ++ self.execute_with_timeout(scrape_empty_page) ++ ++ def test_media_order_preservation(self): ++ """Test scraped media appears in the same order as in the post.""" ++ scraper = TelegramChannelScraper('nexta_live?before=43103') ++ item = next(scraper.get_items(), None) ++ self.assertIsNotNone(item, "Failed to scrape any posts.") ++ self.assertEqual(item.url, "https://t.me/s/nexta_live/43102") ++ ++ # Directly validate the types of the objects in the media array ++ expected_types = [Video, Photo, Video] # Adjust based on expected types ++ actual_types = [type(media) for media in item.media] if item.media else [] ++ ++ self.assertEqual(actual_types, expected_types, "Media did not appear in the expected order.") ++ ++ ++if __name__ == '__main__': ++ unittest.main() + +From 7d061cb5279e153f829340f848bc4ba01d716f26 Mon Sep 17 00:00:00 2001 +From: John O'Sullivan +Date: Thu, 14 Mar 2024 01:55:16 -0400 +Subject: [PATCH 7/8] Add docstring saying suite should run by directly running + file + +--- + snscrape/modules/telegram.py | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py +index 7a85cb58..c6e0b0ee 100644 +--- a/snscrape/modules/telegram.py ++++ b/snscrape/modules/telegram.py +@@ -336,7 +336,9 @@ def _parse_video_message(videoPlayer): + mKwargs['duration'] = _durationStrToSeconds(durationStr) + return cls(**mKwargs) + ++ + class TestTelegramChannelScraper(unittest.TestCase): ++ """Run suite by directly calling this file.""" + + @staticmethod + def execute_with_timeout(func, timeout=10): + +From 9309b1b01c6db15862809623e2c5adddecd894be Mon Sep 17 00:00:00 2001 +From: John O'Sullivan +Date: Thu, 14 Mar 2024 02:00:50 -0400 +Subject: [PATCH 8/8] Correct some inaccurate test descriptions + +--- + snscrape/modules/telegram.py | 13 +++++++------ + 1 file changed, 7 insertions(+), 6 deletions(-) + +diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py +index c6e0b0ee..dbf0f9b3 100644 +--- a/snscrape/modules/telegram.py ++++ b/snscrape/modules/telegram.py +@@ -338,7 +338,7 @@ def _parse_video_message(videoPlayer): + + + class TestTelegramChannelScraper(unittest.TestCase): +- """Run suite by directly calling this file.""" ++ """Run suite by directly running this file.""" + + @staticmethod + def execute_with_timeout(func, timeout=10): +@@ -383,7 +383,7 @@ def scrape_two_pages(): + self.execute_with_timeout(scrape_two_pages) + + def test_scraping_termination_small_post_count(self): +- """Test scraping always terminates, even with small number of posts. This channel has only 28.""" ++ """Test scraping always terminates, even with small number of posts. This channel's highest ID is 28.""" + + def scrape_small_channel(): + scraper = TelegramChannelScraper('AKCPB') +@@ -392,8 +392,8 @@ def scrape_small_channel(): + + self.execute_with_timeout(scrape_small_channel) + +- def test_scraping_termination_channels_without_post_id_one(self): +- """Test scraping gracefully handles channels missing a post where id=1.""" ++ def test_scraping_termination_pages_without_posts(self): ++ """Test scraping gracefully handles pages without any posts.""" + + def scrape_empty_page(): + scraper = TelegramChannelScraper('BREAKDCODE?before=3') +@@ -407,10 +407,11 @@ def test_media_order_preservation(self): + scraper = TelegramChannelScraper('nexta_live?before=43103') + item = next(scraper.get_items(), None) + self.assertIsNotNone(item, "Failed to scrape any posts.") ++ ++ # This particular post is known to include media [Video, Photo, Video] + self.assertEqual(item.url, "https://t.me/s/nexta_live/43102") + +- # Directly validate the types of the objects in the media array +- expected_types = [Video, Photo, Video] # Adjust based on expected types ++ expected_types = [Video, Photo, Video] + actual_types = [type(media) for media in item.media] if item.media else [] + + self.assertEqual(actual_types, expected_types, "Media did not appear in the expected order.") -- cgit v1.2.3