summary refs log tree commit diff stats
path: root/aux-files
diff options
context:
space:
mode:
Diffstat (limited to 'aux-files')
-rw-r--r-- aux-files/snscrape/snscrape-downloads-telegram.patch 495
1 files changed, 0 insertions, 495 deletions
diff --git a/aux-files/snscrape/snscrape-downloads-telegram.patch b/aux-files/snscrape/snscrape-downloads-telegram.patch
deleted file mode 100644
index 46665c4..0000000
--- a/aux-files/snscrape/snscrape-downloads-telegram.patch
+++ /dev/null
@@ -1,495 +0,0 @@
-From 00239388e3096277a55271a8786b4b5d6d2bec84 Mon Sep 17 00:00:00 2001
-From: John O'Sullivan <j.osullivan42@gmail.com>
-Date: Thu, 18 Jan 2024 11:37:32 -0500
-Subject: [PATCH 1/8] WIP: Fixed 2.5 out of 5 issues mentioned in PR
-
----
- snscrape/base.py | 1 +
- snscrape/modules/telegram.py | 12 ++++++------
- 2 files changed, 7 insertions(+), 6 deletions(-)
-
-diff --git a/snscrape/base.py b/snscrape/base.py
-index c9e75d9d..5ce5e1da 100644
---- a/snscrape/base.py
-+++ b/snscrape/base.py
-@@ -193,6 +193,7 @@ def _request(self, method, url, params = None, data = None, headers = None, time
- # The request is newly prepared on each retry because of potential cookie updates.
- req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers))
- environmentSettings = self._session.merge_environment_settings(req.url, proxies, None, None, None)
-+ _logger.info("Hey there, I'm in here")
- _logger.info(f'Retrieving {req.url}')
- _logger.debug(f'... with headers: {headers!r}')
- if data:
-diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py
-index 4e977656..54345d96 100644
---- a/snscrape/modules/telegram.py
-+++ b/snscrape/modules/telegram.py
-@@ -196,6 +196,7 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False):
- }
- timeTag = videoPlayer.find('time')
- if timeTag is None:
-+ _logger.warning(f'Could not find duration for video or GIF at {url}')
- cls = Gif
- else:
- cls = Video
-@@ -219,8 +220,6 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False):
- else:
- _logger.warning(f'Could not process link preview image on {url}')
- linkPreview = LinkPreview(**kwargs)
-- if kwargs['href'] in outlinks:
-- outlinks.remove(kwargs['href'])
-
- viewsSpan = post.find('span', class_ = 'tgme_widget_message_views')
- views = None if viewsSpan is None else _parse_num(viewsSpan.text)
-@@ -239,13 +238,14 @@ def get_items(self):
- return
- nextPageUrl = ''
- while True:
-+ print("About to yield from get_items")
- yield from self._soup_to_items(soup, r.url)
-- try:
-- if soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True)['href'].split('/')[-1] == '1':
-+ dateElt = soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True)
-+ if dateElt and 'href' in dateElt.attrs:
-+ urlPieces = dateElt['href'].split('/')
-+ if urlPieces and urlPieces[-1] == '1':
- # if message 1 is the first message in the page, terminate scraping
- break
-- except:
-- pass
- pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True})
- if not pageLink:
- # some pages are missing a "tme_messages_more" tag, causing early termination
-
-From 670905fedb64656b94c6fb920c8628d318171b64 Mon Sep 17 00:00:00 2001
-From: John O'Sullivan <j.osullivan42@gmail.com>
-Date: Thu, 18 Jan 2024 11:46:46 -0500
-Subject: [PATCH 2/8] Remove test log statement, add link to example GIF
-
----
- snscrape/base.py | 1 -
- snscrape/modules/telegram.py | 2 +-
- 2 files changed, 1 insertion(+), 2 deletions(-)
-
-diff --git a/snscrape/base.py b/snscrape/base.py
-index 5ce5e1da..c9e75d9d 100644
---- a/snscrape/base.py
-+++ b/snscrape/base.py
-@@ -193,7 +193,6 @@ def _request(self, method, url, params = None, data = None, headers = None, time
- # The request is newly prepared on each retry because of potential cookie updates.
- req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers))
- environmentSettings = self._session.merge_environment_settings(req.url, proxies, None, None, None)
-- _logger.info("Hey there, I'm in here")
- _logger.info(f'Retrieving {req.url}')
- _logger.debug(f'... with headers: {headers!r}')
- if data:
-diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py
-index 54345d96..01e99318 100644
---- a/snscrape/modules/telegram.py
-+++ b/snscrape/modules/telegram.py
-@@ -196,7 +196,7 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False):
- }
- timeTag = videoPlayer.find('time')
- if timeTag is None:
-- _logger.warning(f'Could not find duration for video or GIF at {url}')
-+ # Example of duration-less GIF: https://t.me/thisisatestchannel19451923/3
- cls = Gif
- else:
- cls = Video
-
-From 54df8832f5b5bc3af58c3faf953966a2070a834d Mon Sep 17 00:00:00 2001
-From: John O'Sullivan <j.osullivan42@gmail.com>
-Date: Thu, 22 Feb 2024 01:06:04 -0500
-Subject: [PATCH 3/8] Added media processing into main link loop; using prev
- tag to get page, rather than index math
-
----
- snscrape/modules/telegram.py | 84 +++++++++++++++++++-----------------
- 1 file changed, 44 insertions(+), 40 deletions(-)
-
-diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py
-index 01e99318..b4f3d78e 100644
---- a/snscrape/modules/telegram.py
-+++ b/snscrape/modules/telegram.py
-@@ -152,7 +152,7 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False):
- imageUrls = _STYLE_MEDIA_URL_PATTERN.findall(style)
- if len(imageUrls) == 1:
- media.append(Photo(url = imageUrls[0]))
-- continue
-+
- if _SINGLE_MEDIA_LINK_PATTERN.match(link['href']):
- style = link.attrs.get('style', '')
- imageUrls = _STYLE_MEDIA_URL_PATTERN.findall(style)
-@@ -161,49 +161,23 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False):
- # resp = self._get(image[0])
- # encoded_string = base64.b64encode(resp.content)
- # Individual photo or video link
-- continue
-+
- if link.text.startswith('@'):
- mentions.append(link.text.strip('@'))
-- continue
-+
- if link.text.startswith('#'):
- hashtags.append(link.text.strip('#'))
-- continue
-+
-+ if 'tgme_widget_message_voice_player' in link.get('class', []):
-+ media.append(_parse_voice_message(link))
-+
-+ if 'tgme_widget_message_video_player' in link.get('class', []):
-+ media.append(_parse_video_message(link))
-+
- href = urllib.parse.urljoin(pageUrl, link['href'])
- if (href not in outlinks) and (href != rawUrl) and (href != forwardedUrl):
- outlinks.append(href)
-
-- for voicePlayer in post.find_all('a', {'class': 'tgme_widget_message_voice_player'}):
-- audioUrl = voicePlayer.find('audio')['src']
-- durationStr = voicePlayer.find('time').text
-- duration = _durationStrToSeconds(durationStr)
-- barHeights = [float(s['style'].split(':')[-1].strip(';%')) for s in voicePlayer.find('div', {'class': 'bar'}).find_all('s')]
--
-- media.append(VoiceMessage(url = audioUrl, duration = duration, bars = barHeights))
--
-- for videoPlayer in post.find_all('a', {'class': 'tgme_widget_message_video_player'}):
-- iTag = videoPlayer.find('i')
-- if iTag is None:
-- videoUrl = None
-- videoThumbnailUrl = None
-- else:
-- style = iTag['style']
-- videoThumbnailUrl = _STYLE_MEDIA_URL_PATTERN.findall(style)[0]
-- videoTag = videoPlayer.find('video')
-- videoUrl = None if videoTag is None else videoTag['src']
-- mKwargs = {
-- 'thumbnailUrl': videoThumbnailUrl,
-- 'url': videoUrl,
-- }
-- timeTag = videoPlayer.find('time')
-- if timeTag is None:
-- # Example of duration-less GIF: https://t.me/thisisatestchannel19451923/3
-- cls = Gif
-- else:
-- cls = Video
-- durationStr = videoPlayer.find('time').text
-- mKwargs['duration'] = _durationStrToSeconds(durationStr)
-- media.append(cls(**mKwargs))
--
- linkPreview = None
- if (linkPreviewA := post.find('a', class_ = 'tgme_widget_message_link_preview')):
- kwargs = {}
-@@ -250,10 +224,10 @@ def get_items(self):
- if not pageLink:
- # some pages are missing a "tme_messages_more" tag, causing early termination
- if '=' not in nextPageUrl:
-- nextPageUrl = soup.find('link', attrs = {'rel': 'canonical'}, href = True)['href']
-- nextPostIndex = int(nextPageUrl.split('=')[-1]) - 20
-+ nextPageUrl = soup.find('link', attrs = {'rel': 'prev'}, href = True)['href']
-+ nextPostIndex = int(nextPageUrl.split('=')[-1])
- if nextPostIndex > 20:
-- pageLink = {'href': nextPageUrl.split('=')[0] + f'={nextPostIndex}'}
-+ pageLink = {'href': nextPageUrl}
- else:
- break
- nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
-@@ -333,4 +307,34 @@ def _telegramResponseOkCallback(r):
- if r.status_code == 200:
- return (True, None)
- return (False, f'{r.status_code=}')
--
-\ No newline at end of file
-+
-+def _parse_voice_message(voicePlayer):
-+ audioUrl = voicePlayer.find('audio')['src']
-+ durationStr = voicePlayer.find('time').text
-+ duration = _durationStrToSeconds(durationStr)
-+ barHeights = [float(s['style'].split(':')[-1].strip(';%')) for s in voicePlayer.find('div', {'class': 'bar'}).find_all('s')]
-+ return VoiceMessage(url = audioUrl, duration = duration, bars = barHeights)
-+
-+def _parse_video_message(videoPlayer):
-+ iTag = videoPlayer.find('i')
-+ if iTag is None:
-+ videoUrl = None
-+ videoThumbnailUrl = None
-+ else:
-+ style = iTag['style']
-+ videoThumbnailUrl = _STYLE_MEDIA_URL_PATTERN.findall(style)[0]
-+ videoTag = videoPlayer.find('video')
-+ videoUrl = None if videoTag is None else videoTag['src']
-+ mKwargs = {
-+ 'thumbnailUrl': videoThumbnailUrl,
-+ 'url': videoUrl,
-+ }
-+ timeTag = videoPlayer.find('time')
-+ if timeTag is None:
-+ # Example of duration-less GIF: https://t.me/thisisatestchannel19451923/3
-+ cls = Gif
-+ else:
-+ cls = Video
-+ durationStr = videoPlayer.find('time').text
-+ mKwargs['duration'] = _durationStrToSeconds(durationStr)
-+ return cls(**mKwargs)
-\ No newline at end of file
-
-From 2dfd1542f19bbadad603e00e61712943542fbfe1 Mon Sep 17 00:00:00 2001
-From: John O'Sullivan <j.osullivan42@gmail.com>
-Date: Thu, 22 Feb 2024 01:07:46 -0500
-Subject: [PATCH 4/8] Forgot to remove a test log
-
----
- snscrape/modules/telegram.py | 1 -
- 1 file changed, 1 deletion(-)
-
-diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py
-index b4f3d78e..8f6d18d7 100644
---- a/snscrape/modules/telegram.py
-+++ b/snscrape/modules/telegram.py
-@@ -212,7 +212,6 @@ def get_items(self):
- return
- nextPageUrl = ''
- while True:
-- print("About to yield from get_items")
- yield from self._soup_to_items(soup, r.url)
- dateElt = soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True)
- if dateElt and 'href' in dateElt.attrs:
-
-From a93f6a3fad0d19209a49c7b730fea73659743774 Mon Sep 17 00:00:00 2001
-From: John O'Sullivan <j.osullivan42@gmail.com>
-Date: Fri, 1 Mar 2024 12:51:26 -0500
-Subject: [PATCH 5/8] Applying trislee's suggested fix for getting nextPageUrl
-
----
- snscrape/modules/telegram.py | 16 +++++-----------
- 1 file changed, 5 insertions(+), 11 deletions(-)
-
-diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py
-index 8f6d18d7..ac0feef8 100644
---- a/snscrape/modules/telegram.py
-+++ b/snscrape/modules/telegram.py
-@@ -219,17 +219,11 @@ def get_items(self):
- if urlPieces and urlPieces[-1] == '1':
- # if message 1 is the first message in the page, terminate scraping
- break
-- pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True})
-- if not pageLink:
-- # some pages are missing a "tme_messages_more" tag, causing early termination
-- if '=' not in nextPageUrl:
-- nextPageUrl = soup.find('link', attrs = {'rel': 'prev'}, href = True)['href']
-- nextPostIndex = int(nextPageUrl.split('=')[-1])
-- if nextPostIndex > 20:
-- pageLink = {'href': nextPageUrl}
-- else:
-- break
-- nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
-+ if pageLink := soup.find('link', attrs = {'rel': 'prev'}, href = True):
-+ nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
-+ else:
-+ nextPostIndex = int(soup.find('div', attrs = {'class': 'tgme_widget_message', 'data-post': True})["data-post"].split("/")[-1])
-+ nextPageUrl = urllib.parse.urljoin(r.url, r.url.split('?')[0] + f'?before={nextPostIndex}')
- r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = _telegramResponseOkCallback)
- if r.status_code != 200:
- raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
-
-From a542aa57598f94f69fd7b69789e97045e92133da Mon Sep 17 00:00:00 2001
-From: John O'Sullivan <j.osullivan42@gmail.com>
-Date: Thu, 14 Mar 2024 01:50:38 -0400
-Subject: [PATCH 6/8] Ensured termination on channels w/o an id=1 post, wrote
- test cases to prevent regression
-
----
- snscrape/modules/telegram.py | 87 +++++++++++++++++++++++++++++++++++-
- 1 file changed, 86 insertions(+), 1 deletion(-)
-
-diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py
-index ac0feef8..7a85cb58 100644
---- a/snscrape/modules/telegram.py
-+++ b/snscrape/modules/telegram.py
-@@ -9,6 +9,8 @@
- import snscrape.base
- import typing
- import urllib.parse
-+import unittest
-+import threading
-
- _logger = logging.getLogger(__name__)
- _SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$')
-@@ -212,6 +214,8 @@ def get_items(self):
- return
- nextPageUrl = ''
- while True:
-+ if soup.find("div", class_ = "tme_no_messages_found"):
-+ break
- yield from self._soup_to_items(soup, r.url)
- dateElt = soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True)
- if dateElt and 'href' in dateElt.attrs:
-@@ -330,4 +334,85 @@ def _parse_video_message(videoPlayer):
- cls = Video
- durationStr = videoPlayer.find('time').text
- mKwargs['duration'] = _durationStrToSeconds(durationStr)
-- return cls(**mKwargs)
-\ No newline at end of file
-+ return cls(**mKwargs)
-+
-+class TestTelegramChannelScraper(unittest.TestCase):
-+
-+ @staticmethod
-+ def execute_with_timeout(func, timeout=10):
-+ """
-+ Executes a function in a separate thread and enforces a timeout.
-+ If provided function throws an error, it's re-raised in main thread.
-+ Used to detect infinite loops in finite time, works cross-platform.
-+
-+ :param func: The function to execute. This function should accept no arguments.
-+ :param timeout: The timeout in seconds.
-+ """
-+ exceptions=[]
-+ def func_passing_exceptions():
-+ try:
-+ func()
-+ except Exception as e:
-+ exceptions.append((e.__class__, e, e.__traceback__))
-+
-+ thread = threading.Thread(target=func_passing_exceptions)
-+ thread.start()
-+ thread.join(timeout=timeout)
-+
-+ if exceptions:
-+ exc_class, exc_instance, traceback = exceptions[0]
-+ raise exc_class(exc_instance).with_traceback(traceback)
-+
-+ if thread.is_alive():
-+ raise TimeoutError(f"Function didn't complete within {timeout} seconds")
-+
-+ def test_scraping_termination_missing_prev(self):
-+ """Test scraping always terminates, even if the page's prev link is missing."""
-+
-+ def scrape_two_pages():
-+ scraper = TelegramChannelScraper('WLM_USA_TEXAS?before=3766')
-+ items = list()
-+ num_items_on_page = 20
-+ for item in scraper.get_items():
-+ items.append(item)
-+ if len(items) > 2 * num_items_on_page:
-+ break
-+
-+ self.execute_with_timeout(scrape_two_pages)
-+
-+ def test_scraping_termination_small_post_count(self):
-+ """Test scraping always terminates, even with small number of posts. This channel has only 28."""
-+
-+ def scrape_small_channel():
-+ scraper = TelegramChannelScraper('AKCPB')
-+ items = list(scraper.get_items())
-+ return items
-+
-+ self.execute_with_timeout(scrape_small_channel)
-+
-+ def test_scraping_termination_channels_without_post_id_one(self):
-+ """Test scraping gracefully handles channels missing a post where id=1."""
-+
-+ def scrape_empty_page():
-+ scraper = TelegramChannelScraper('BREAKDCODE?before=3')
-+ for _ in scraper.get_items():
-+ pass
-+
-+ self.execute_with_timeout(scrape_empty_page)
-+
-+ def test_media_order_preservation(self):
-+ """Test scraped media appears in the same order as in the post."""
-+ scraper = TelegramChannelScraper('nexta_live?before=43103')
-+ item = next(scraper.get_items(), None)
-+ self.assertIsNotNone(item, "Failed to scrape any posts.")
-+ self.assertEqual(item.url, "https://t.me/s/nexta_live/43102")
-+
-+ # Directly validate the types of the objects in the media array
-+ expected_types = [Video, Photo, Video] # Adjust based on expected types
-+ actual_types = [type(media) for media in item.media] if item.media else []
-+
-+ self.assertEqual(actual_types, expected_types, "Media did not appear in the expected order.")
-+
-+
-+if __name__ == '__main__':
-+ unittest.main()
-
-From 7d061cb5279e153f829340f848bc4ba01d716f26 Mon Sep 17 00:00:00 2001
-From: John O'Sullivan <j.osullivan42@gmail.com>
-Date: Thu, 14 Mar 2024 01:55:16 -0400
-Subject: [PATCH 7/8] Add docstring saying suite should run by directly running
- file
-
----
- snscrape/modules/telegram.py | 2 ++
- 1 file changed, 2 insertions(+)
-
-diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py
-index 7a85cb58..c6e0b0ee 100644
---- a/snscrape/modules/telegram.py
-+++ b/snscrape/modules/telegram.py
-@@ -336,7 +336,9 @@ def _parse_video_message(videoPlayer):
- mKwargs['duration'] = _durationStrToSeconds(durationStr)
- return cls(**mKwargs)
-
-+
- class TestTelegramChannelScraper(unittest.TestCase):
-+ """Run suite by directly calling this file."""
-
- @staticmethod
- def execute_with_timeout(func, timeout=10):
-
-From 9309b1b01c6db15862809623e2c5adddecd894be Mon Sep 17 00:00:00 2001
-From: John O'Sullivan <j.osullivan42@gmail.com>
-Date: Thu, 14 Mar 2024 02:00:50 -0400
-Subject: [PATCH 8/8] Correct some inaccurate test descriptions
-
----
- snscrape/modules/telegram.py | 13 +++++++------
- 1 file changed, 7 insertions(+), 6 deletions(-)
-
-diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py
-index c6e0b0ee..dbf0f9b3 100644
---- a/snscrape/modules/telegram.py
-+++ b/snscrape/modules/telegram.py
-@@ -338,7 +338,7 @@ def _parse_video_message(videoPlayer):
-
-
- class TestTelegramChannelScraper(unittest.TestCase):
-- """Run suite by directly calling this file."""
-+ """Run suite by directly running this file."""
-
- @staticmethod
- def execute_with_timeout(func, timeout=10):
-@@ -383,7 +383,7 @@ def scrape_two_pages():
- self.execute_with_timeout(scrape_two_pages)
-
- def test_scraping_termination_small_post_count(self):
-- """Test scraping always terminates, even with small number of posts. This channel has only 28."""
-+ """Test scraping always terminates, even with small number of posts. This channel's highest ID is 28."""
-
- def scrape_small_channel():
- scraper = TelegramChannelScraper('AKCPB')
-@@ -392,8 +392,8 @@ def scrape_small_channel():
-
- self.execute_with_timeout(scrape_small_channel)
-
-- def test_scraping_termination_channels_without_post_id_one(self):
-- """Test scraping gracefully handles channels missing a post where id=1."""
-+ def test_scraping_termination_pages_without_posts(self):
-+ """Test scraping gracefully handles pages without any posts."""
-
- def scrape_empty_page():
- scraper = TelegramChannelScraper('BREAKDCODE?before=3')
-@@ -407,10 +407,11 @@ def test_media_order_preservation(self):
- scraper = TelegramChannelScraper('nexta_live?before=43103')
- item = next(scraper.get_items(), None)
- self.assertIsNotNone(item, "Failed to scrape any posts.")
-+
-+ # This particular post is known to include media [Video, Photo, Video]
- self.assertEqual(item.url, "https://t.me/s/nexta_live/43102")
-
-- # Directly validate the types of the objects in the media array
-- expected_types = [Video, Photo, Video] # Adjust based on expected types
-+ expected_types = [Video, Photo, Video]
- actual_types = [type(media) for media in item.media] if item.media else []
-
- self.assertEqual(actual_types, expected_types, "Media did not appear in the expected order.")