From 972a1be7194b0e36e7b48ad38d89b4ca2340a1ab Mon Sep 17 00:00:00 2001 From: msglm Date: Thu, 23 May 2024 05:17:09 -0500 Subject: Revert "Revert "remove snsscrape patch"" This reverts commit 91f5e47b21c2108f0031f339413904b84fdefec5. --- .../snscrape/snscrape-downloads-telegram.patch | 495 --------------------- 1 file changed, 495 deletions(-) delete mode 100644 aux-files/snscrape/snscrape-downloads-telegram.patch diff --git a/aux-files/snscrape/snscrape-downloads-telegram.patch b/aux-files/snscrape/snscrape-downloads-telegram.patch deleted file mode 100644 index 46665c4..0000000 --- a/aux-files/snscrape/snscrape-downloads-telegram.patch +++ /dev/null @@ -1,495 +0,0 @@ -From 00239388e3096277a55271a8786b4b5d6d2bec84 Mon Sep 17 00:00:00 2001 -From: John O'Sullivan -Date: Thu, 18 Jan 2024 11:37:32 -0500 -Subject: [PATCH 1/8] WIP: Fixed 2.5 out of 5 issues mentioned in PR - ---- - snscrape/base.py | 1 + - snscrape/modules/telegram.py | 12 ++++++------ - 2 files changed, 7 insertions(+), 6 deletions(-) - -diff --git a/snscrape/base.py b/snscrape/base.py -index c9e75d9d..5ce5e1da 100644 ---- a/snscrape/base.py -+++ b/snscrape/base.py -@@ -193,6 +193,7 @@ def _request(self, method, url, params = None, data = None, headers = None, time - # The request is newly prepared on each retry because of potential cookie updates. - req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers)) - environmentSettings = self._session.merge_environment_settings(req.url, proxies, None, None, None) -+ _logger.info("Hey there, I'm in here") - _logger.info(f'Retrieving {req.url}') - _logger.debug(f'... with headers: {headers!r}') - if data: -diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py -index 4e977656..54345d96 100644 ---- a/snscrape/modules/telegram.py -+++ b/snscrape/modules/telegram.py -@@ -196,6 +196,7 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False): - } - timeTag = videoPlayer.find('time') - if timeTag is None: -+ _logger.warning(f'Could not find duration for video or GIF at {url}') - cls = Gif - else: - cls = Video -@@ -219,8 +220,6 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False): - else: - _logger.warning(f'Could not process link preview image on {url}') - linkPreview = LinkPreview(**kwargs) -- if kwargs['href'] in outlinks: -- outlinks.remove(kwargs['href']) - - viewsSpan = post.find('span', class_ = 'tgme_widget_message_views') - views = None if viewsSpan is None else _parse_num(viewsSpan.text) -@@ -239,13 +238,14 @@ def get_items(self): - return - nextPageUrl = '' - while True: -+ print("About to yield from get_items") - yield from self._soup_to_items(soup, r.url) -- try: -- if soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True)['href'].split('/')[-1] == '1': -+ dateElt = soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True) -+ if dateElt and 'href' in dateElt.attrs: -+ urlPieces = dateElt['href'].split('/') -+ if urlPieces and urlPieces[-1] == '1': - # if message 1 is the first message in the page, terminate scraping - break -- except: -- pass - pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True}) - if not pageLink: - # some pages are missing a "tme_messages_more" tag, causing early termination - -From 670905fedb64656b94c6fb920c8628d318171b64 Mon Sep 17 00:00:00 2001 -From: John O'Sullivan -Date: Thu, 18 Jan 2024 11:46:46 -0500 -Subject: [PATCH 2/8] Remove test log statement, add link to example GIF - ---- - snscrape/base.py | 1 - - snscrape/modules/telegram.py | 2 +- - 2 files changed, 1 insertion(+), 2 deletions(-) - -diff --git a/snscrape/base.py b/snscrape/base.py -index 5ce5e1da..c9e75d9d 100644 ---- a/snscrape/base.py -+++ b/snscrape/base.py -@@ -193,7 +193,6 @@ def _request(self, method, url, params = None, data = None, headers = None, time - # The request is newly prepared on each retry because of potential cookie updates. - req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers)) - environmentSettings = self._session.merge_environment_settings(req.url, proxies, None, None, None) -- _logger.info("Hey there, I'm in here") - _logger.info(f'Retrieving {req.url}') - _logger.debug(f'... with headers: {headers!r}') - if data: -diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py -index 54345d96..01e99318 100644 ---- a/snscrape/modules/telegram.py -+++ b/snscrape/modules/telegram.py -@@ -196,7 +196,7 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False): - } - timeTag = videoPlayer.find('time') - if timeTag is None: -- _logger.warning(f'Could not find duration for video or GIF at {url}') -+ # Example of duration-less GIF: https://t.me/thisisatestchannel19451923/3 - cls = Gif - else: - cls = Video - -From 54df8832f5b5bc3af58c3faf953966a2070a834d Mon Sep 17 00:00:00 2001 -From: John O'Sullivan -Date: Thu, 22 Feb 2024 01:06:04 -0500 -Subject: [PATCH 3/8] Added media processing into main link loop; using prev - tag to get page, rather than index math - ---- - snscrape/modules/telegram.py | 84 +++++++++++++++++++----------------- - 1 file changed, 44 insertions(+), 40 deletions(-) - -diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py -index 01e99318..b4f3d78e 100644 ---- a/snscrape/modules/telegram.py -+++ b/snscrape/modules/telegram.py -@@ -152,7 +152,7 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False): - imageUrls = _STYLE_MEDIA_URL_PATTERN.findall(style) - if len(imageUrls) == 1: - media.append(Photo(url = imageUrls[0])) -- continue -+ - if _SINGLE_MEDIA_LINK_PATTERN.match(link['href']): - style = link.attrs.get('style', '') - imageUrls = _STYLE_MEDIA_URL_PATTERN.findall(style) -@@ -161,49 +161,23 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False): - # resp = self._get(image[0]) - # encoded_string = base64.b64encode(resp.content) - # Individual photo or video link -- continue -+ - if link.text.startswith('@'): - mentions.append(link.text.strip('@')) -- continue -+ - if link.text.startswith('#'): - hashtags.append(link.text.strip('#')) -- continue -+ -+ if 'tgme_widget_message_voice_player' in link.get('class', []): -+ media.append(_parse_voice_message(link)) -+ -+ if 'tgme_widget_message_video_player' in link.get('class', []): -+ media.append(_parse_video_message(link)) -+ - href = urllib.parse.urljoin(pageUrl, link['href']) - if (href not in outlinks) and (href != rawUrl) and (href != forwardedUrl): - outlinks.append(href) - -- for voicePlayer in post.find_all('a', {'class': 'tgme_widget_message_voice_player'}): -- audioUrl = voicePlayer.find('audio')['src'] -- durationStr = voicePlayer.find('time').text -- duration = _durationStrToSeconds(durationStr) -- barHeights = [float(s['style'].split(':')[-1].strip(';%')) for s in voicePlayer.find('div', {'class': 'bar'}).find_all('s')] -- -- media.append(VoiceMessage(url = audioUrl, duration = duration, bars = barHeights)) -- -- for videoPlayer in post.find_all('a', {'class': 'tgme_widget_message_video_player'}): -- iTag = videoPlayer.find('i') -- if iTag is None: -- videoUrl = None -- videoThumbnailUrl = None -- else: -- style = iTag['style'] -- videoThumbnailUrl = _STYLE_MEDIA_URL_PATTERN.findall(style)[0] -- videoTag = videoPlayer.find('video') -- videoUrl = None if videoTag is None else videoTag['src'] -- mKwargs = { -- 'thumbnailUrl': videoThumbnailUrl, -- 'url': videoUrl, -- } -- timeTag = videoPlayer.find('time') -- if timeTag is None: -- # Example of duration-less GIF: https://t.me/thisisatestchannel19451923/3 -- cls = Gif -- else: -- cls = Video -- durationStr = videoPlayer.find('time').text -- mKwargs['duration'] = _durationStrToSeconds(durationStr) -- media.append(cls(**mKwargs)) -- - linkPreview = None - if (linkPreviewA := post.find('a', class_ = 'tgme_widget_message_link_preview')): - kwargs = {} -@@ -250,10 +224,10 @@ def get_items(self): - if not pageLink: - # some pages are missing a "tme_messages_more" tag, causing early termination - if '=' not in nextPageUrl: -- nextPageUrl = soup.find('link', attrs = {'rel': 'canonical'}, href = True)['href'] -- nextPostIndex = int(nextPageUrl.split('=')[-1]) - 20 -+ nextPageUrl = soup.find('link', attrs = {'rel': 'prev'}, href = True)['href'] -+ nextPostIndex = int(nextPageUrl.split('=')[-1]) - if nextPostIndex > 20: -- pageLink = {'href': nextPageUrl.split('=')[0] + f'={nextPostIndex}'} -+ pageLink = {'href': nextPageUrl} - else: - break - nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href']) -@@ -333,4 +307,34 @@ def _telegramResponseOkCallback(r): - if r.status_code == 200: - return (True, None) - return (False, f'{r.status_code=}') -- -\ No newline at end of file -+ -+def _parse_voice_message(voicePlayer): -+ audioUrl = voicePlayer.find('audio')['src'] -+ durationStr = voicePlayer.find('time').text -+ duration = _durationStrToSeconds(durationStr) -+ barHeights = [float(s['style'].split(':')[-1].strip(';%')) for s in voicePlayer.find('div', {'class': 'bar'}).find_all('s')] -+ return VoiceMessage(url = audioUrl, duration = duration, bars = barHeights) -+ -+def _parse_video_message(videoPlayer): -+ iTag = videoPlayer.find('i') -+ if iTag is None: -+ videoUrl = None -+ videoThumbnailUrl = None -+ else: -+ style = iTag['style'] -+ videoThumbnailUrl = _STYLE_MEDIA_URL_PATTERN.findall(style)[0] -+ videoTag = videoPlayer.find('video') -+ videoUrl = None if videoTag is None else videoTag['src'] -+ mKwargs = { -+ 'thumbnailUrl': videoThumbnailUrl, -+ 'url': videoUrl, -+ } -+ timeTag = videoPlayer.find('time') -+ if timeTag is None: -+ # Example of duration-less GIF: https://t.me/thisisatestchannel19451923/3 -+ cls = Gif -+ else: -+ cls = Video -+ durationStr = videoPlayer.find('time').text -+ mKwargs['duration'] = _durationStrToSeconds(durationStr) -+ return cls(**mKwargs) -\ No newline at end of file - -From 2dfd1542f19bbadad603e00e61712943542fbfe1 Mon Sep 17 00:00:00 2001 -From: John O'Sullivan -Date: Thu, 22 Feb 2024 01:07:46 -0500 -Subject: [PATCH 4/8] Forgot to remove a test log - ---- - snscrape/modules/telegram.py | 1 - - 1 file changed, 1 deletion(-) - -diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py -index b4f3d78e..8f6d18d7 100644 ---- a/snscrape/modules/telegram.py -+++ b/snscrape/modules/telegram.py -@@ -212,7 +212,6 @@ def get_items(self): - return - nextPageUrl = '' - while True: -- print("About to yield from get_items") - yield from self._soup_to_items(soup, r.url) - dateElt = soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True) - if dateElt and 'href' in dateElt.attrs: - -From a93f6a3fad0d19209a49c7b730fea73659743774 Mon Sep 17 00:00:00 2001 -From: John O'Sullivan -Date: Fri, 1 Mar 2024 12:51:26 -0500 -Subject: [PATCH 5/8] Applying trislee's suggested fix for getting nextPageUrl - ---- - snscrape/modules/telegram.py | 16 +++++----------- - 1 file changed, 5 insertions(+), 11 deletions(-) - -diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py -index 8f6d18d7..ac0feef8 100644 ---- a/snscrape/modules/telegram.py -+++ b/snscrape/modules/telegram.py -@@ -219,17 +219,11 @@ def get_items(self): - if urlPieces and urlPieces[-1] == '1': - # if message 1 is the first message in the page, terminate scraping - break -- pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True}) -- if not pageLink: -- # some pages are missing a "tme_messages_more" tag, causing early termination -- if '=' not in nextPageUrl: -- nextPageUrl = soup.find('link', attrs = {'rel': 'prev'}, href = True)['href'] -- nextPostIndex = int(nextPageUrl.split('=')[-1]) -- if nextPostIndex > 20: -- pageLink = {'href': nextPageUrl} -- else: -- break -- nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href']) -+ if pageLink := soup.find('link', attrs = {'rel': 'prev'}, href = True): -+ nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href']) -+ else: -+ nextPostIndex = int(soup.find('div', attrs = {'class': 'tgme_widget_message', 'data-post': True})["data-post"].split("/")[-1]) -+ nextPageUrl = urllib.parse.urljoin(r.url, r.url.split('?')[0] + f'?before={nextPostIndex}') - r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = _telegramResponseOkCallback) - if r.status_code != 200: - raise snscrape.base.ScraperException(f'Got status code {r.status_code}') - -From a542aa57598f94f69fd7b69789e97045e92133da Mon Sep 17 00:00:00 2001 -From: John O'Sullivan -Date: Thu, 14 Mar 2024 01:50:38 -0400 -Subject: [PATCH 6/8] Ensured termination on channels w/o an id=1 post, wrote - test cases to prevent regression - ---- - snscrape/modules/telegram.py | 87 +++++++++++++++++++++++++++++++++++- - 1 file changed, 86 insertions(+), 1 deletion(-) - -diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py -index ac0feef8..7a85cb58 100644 ---- a/snscrape/modules/telegram.py -+++ b/snscrape/modules/telegram.py -@@ -9,6 +9,8 @@ - import snscrape.base - import typing - import urllib.parse -+import unittest -+import threading - - _logger = logging.getLogger(__name__) - _SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$') -@@ -212,6 +214,8 @@ def get_items(self): - return - nextPageUrl = '' - while True: -+ if soup.find("div", class_ = "tme_no_messages_found"): -+ break - yield from self._soup_to_items(soup, r.url) - dateElt = soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True) - if dateElt and 'href' in dateElt.attrs: -@@ -330,4 +334,85 @@ def _parse_video_message(videoPlayer): - cls = Video - durationStr = videoPlayer.find('time').text - mKwargs['duration'] = _durationStrToSeconds(durationStr) -- return cls(**mKwargs) -\ No newline at end of file -+ return cls(**mKwargs) -+ -+class TestTelegramChannelScraper(unittest.TestCase): -+ -+ @staticmethod -+ def execute_with_timeout(func, timeout=10): -+ """ -+ Executes a function in a separate thread and enforces a timeout. -+ If provided function throws an error, it's re-raised in main thread. -+ Used to detect infinite loops in finite time, works cross-platform. -+ -+ :param func: The function to execute. This function should accept no arguments. -+ :param timeout: The timeout in seconds. -+ """ -+ exceptions=[] -+ def func_passing_exceptions(): -+ try: -+ func() -+ except Exception as e: -+ exceptions.append((e.__class__, e, e.__traceback__)) -+ -+ thread = threading.Thread(target=func_passing_exceptions) -+ thread.start() -+ thread.join(timeout=timeout) -+ -+ if exceptions: -+ exc_class, exc_instance, traceback = exceptions[0] -+ raise exc_class(exc_instance).with_traceback(traceback) -+ -+ if thread.is_alive(): -+ raise TimeoutError(f"Function didn't complete within {timeout} seconds") -+ -+ def test_scraping_termination_missing_prev(self): -+ """Test scraping always terminates, even if the page's prev link is missing.""" -+ -+ def scrape_two_pages(): -+ scraper = TelegramChannelScraper('WLM_USA_TEXAS?before=3766') -+ items = list() -+ num_items_on_page = 20 -+ for item in scraper.get_items(): -+ items.append(item) -+ if len(items) > 2 * num_items_on_page: -+ break -+ -+ self.execute_with_timeout(scrape_two_pages) -+ -+ def test_scraping_termination_small_post_count(self): -+ """Test scraping always terminates, even with small number of posts. This channel has only 28.""" -+ -+ def scrape_small_channel(): -+ scraper = TelegramChannelScraper('AKCPB') -+ items = list(scraper.get_items()) -+ return items -+ -+ self.execute_with_timeout(scrape_small_channel) -+ -+ def test_scraping_termination_channels_without_post_id_one(self): -+ """Test scraping gracefully handles channels missing a post where id=1.""" -+ -+ def scrape_empty_page(): -+ scraper = TelegramChannelScraper('BREAKDCODE?before=3') -+ for _ in scraper.get_items(): -+ pass -+ -+ self.execute_with_timeout(scrape_empty_page) -+ -+ def test_media_order_preservation(self): -+ """Test scraped media appears in the same order as in the post.""" -+ scraper = TelegramChannelScraper('nexta_live?before=43103') -+ item = next(scraper.get_items(), None) -+ self.assertIsNotNone(item, "Failed to scrape any posts.") -+ self.assertEqual(item.url, "https://t.me/s/nexta_live/43102") -+ -+ # Directly validate the types of the objects in the media array -+ expected_types = [Video, Photo, Video] # Adjust based on expected types -+ actual_types = [type(media) for media in item.media] if item.media else [] -+ -+ self.assertEqual(actual_types, expected_types, "Media did not appear in the expected order.") -+ -+ -+if __name__ == '__main__': -+ unittest.main() - -From 7d061cb5279e153f829340f848bc4ba01d716f26 Mon Sep 17 00:00:00 2001 -From: John O'Sullivan -Date: Thu, 14 Mar 2024 01:55:16 -0400 -Subject: [PATCH 7/8] Add docstring saying suite should run by directly running - file - ---- - snscrape/modules/telegram.py | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py -index 7a85cb58..c6e0b0ee 100644 ---- a/snscrape/modules/telegram.py -+++ b/snscrape/modules/telegram.py -@@ -336,7 +336,9 @@ def _parse_video_message(videoPlayer): - mKwargs['duration'] = _durationStrToSeconds(durationStr) - return cls(**mKwargs) - -+ - class TestTelegramChannelScraper(unittest.TestCase): -+ """Run suite by directly calling this file.""" - - @staticmethod - def execute_with_timeout(func, timeout=10): - -From 9309b1b01c6db15862809623e2c5adddecd894be Mon Sep 17 00:00:00 2001 -From: John O'Sullivan -Date: Thu, 14 Mar 2024 02:00:50 -0400 -Subject: [PATCH 8/8] Correct some inaccurate test descriptions - ---- - snscrape/modules/telegram.py | 13 +++++++------ - 1 file changed, 7 insertions(+), 6 deletions(-) - -diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py -index c6e0b0ee..dbf0f9b3 100644 ---- a/snscrape/modules/telegram.py -+++ b/snscrape/modules/telegram.py -@@ -338,7 +338,7 @@ def _parse_video_message(videoPlayer): - - - class TestTelegramChannelScraper(unittest.TestCase): -- """Run suite by directly calling this file.""" -+ """Run suite by directly running this file.""" - - @staticmethod - def execute_with_timeout(func, timeout=10): -@@ -383,7 +383,7 @@ def scrape_two_pages(): - self.execute_with_timeout(scrape_two_pages) - - def test_scraping_termination_small_post_count(self): -- """Test scraping always terminates, even with small number of posts. This channel has only 28.""" -+ """Test scraping always terminates, even with small number of posts. This channel's highest ID is 28.""" - - def scrape_small_channel(): - scraper = TelegramChannelScraper('AKCPB') -@@ -392,8 +392,8 @@ def scrape_small_channel(): - - self.execute_with_timeout(scrape_small_channel) - -- def test_scraping_termination_channels_without_post_id_one(self): -- """Test scraping gracefully handles channels missing a post where id=1.""" -+ def test_scraping_termination_pages_without_posts(self): -+ """Test scraping gracefully handles pages without any posts.""" - - def scrape_empty_page(): - scraper = TelegramChannelScraper('BREAKDCODE?before=3') -@@ -407,10 +407,11 @@ def test_media_order_preservation(self): - scraper = TelegramChannelScraper('nexta_live?before=43103') - item = next(scraper.get_items(), None) - self.assertIsNotNone(item, "Failed to scrape any posts.") -+ -+ # This particular post is known to include media [Video, Photo, Video] - self.assertEqual(item.url, "https://t.me/s/nexta_live/43102") - -- # Directly validate the types of the objects in the media array -- expected_types = [Video, Photo, Video] # Adjust based on expected types -+ expected_types = [Video, Photo, Video] - actual_types = [type(media) for media in item.media] if item.media else [] - - self.assertEqual(actual_types, expected_types, "Media did not appear in the expected order.") -- cgit v1.2.3