From 00239388e3096277a55271a8786b4b5d6d2bec84 Mon Sep 17 00:00:00 2001
From: John O'Sullivan <j.osullivan42@gmail.com>
Date: Thu, 18 Jan 2024 11:37:32 -0500
Subject: [PATCH 1/8] WIP: Fixed 2.5 out of 5 issues mentioned in PR

---
 snscrape/base.py             |  1 +
 snscrape/modules/telegram.py | 12 ++++++------
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/snscrape/base.py b/snscrape/base.py
index c9e75d9d..5ce5e1da 100644
--- a/snscrape/base.py
+++ b/snscrape/base.py
@@ -193,6 +193,7 @@ def _request(self, method, url, params = None, data = None, headers = None, time
 			# The request is newly prepared on each retry because of potential cookie updates.
 			req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers))
 			environmentSettings = self._session.merge_environment_settings(req.url, proxies, None, None, None)
+			_logger.info("Hey there, I'm in here")
 			_logger.info(f'Retrieving {req.url}')
 			_logger.debug(f'... with headers: {headers!r}')
 			if data:
diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py
index 4e977656..54345d96 100644
--- a/snscrape/modules/telegram.py
+++ b/snscrape/modules/telegram.py
@@ -196,6 +196,7 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False):
 				}
 				timeTag = videoPlayer.find('time')
 				if timeTag is None:
+					_logger.warning(f'Could not find duration for video or GIF at {url}')
 					cls = Gif
 				else:
 					cls = Video
@@ -219,8 +220,6 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False):
 					else:
 						_logger.warning(f'Could not process link preview image on {url}')
 				linkPreview = LinkPreview(**kwargs)
-				if kwargs['href'] in outlinks:
-					outlinks.remove(kwargs['href'])
 
 			viewsSpan = post.find('span', class_ = 'tgme_widget_message_views')
 			views = None if viewsSpan is None else _parse_num(viewsSpan.text)
@@ -239,13 +238,14 @@ def get_items(self):
 			return
 		nextPageUrl = ''
 		while True:
+			print("About to yield from get_items")
 			yield from self._soup_to_items(soup, r.url)
-			try:
-				if soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True)['href'].split('/')[-1] == '1':
+			dateElt = soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True)
+			if dateElt and 'href' in dateElt.attrs:
+				urlPieces = dateElt['href'].split('/')
+				if urlPieces and urlPieces[-1] == '1':
 					# if message 1 is the first message in the page, terminate scraping
 					break
-			except:
-				pass
 			pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True})
 			if not pageLink:
 				# some pages are missing a "tme_messages_more" tag, causing early termination

From 670905fedb64656b94c6fb920c8628d318171b64 Mon Sep 17 00:00:00 2001
From: John O'Sullivan <j.osullivan42@gmail.com>
Date: Thu, 18 Jan 2024 11:46:46 -0500
Subject: [PATCH 2/8] Remove test log statement, add link to example GIF

---
 snscrape/base.py             | 1 -
 snscrape/modules/telegram.py | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/snscrape/base.py b/snscrape/base.py
index 5ce5e1da..c9e75d9d 100644
--- a/snscrape/base.py
+++ b/snscrape/base.py
@@ -193,7 +193,6 @@ def _request(self, method, url, params = None, data = None, headers = None, time
 			# The request is newly prepared on each retry because of potential cookie updates.
 			req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers))
 			environmentSettings = self._session.merge_environment_settings(req.url, proxies, None, None, None)
-			_logger.info("Hey there, I'm in here")
 			_logger.info(f'Retrieving {req.url}')
 			_logger.debug(f'... with headers: {headers!r}')
 			if data:
diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py
index 54345d96..01e99318 100644
--- a/snscrape/modules/telegram.py
+++ b/snscrape/modules/telegram.py
@@ -196,7 +196,7 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False):
 				}
 				timeTag = videoPlayer.find('time')
 				if timeTag is None:
-					_logger.warning(f'Could not find duration for video or GIF at {url}')
+					# Example of duration-less GIF: https://t.me/thisisatestchannel19451923/3
 					cls = Gif
 				else:
 					cls = Video

From 54df8832f5b5bc3af58c3faf953966a2070a834d Mon Sep 17 00:00:00 2001
From: John O'Sullivan <j.osullivan42@gmail.com>
Date: Thu, 22 Feb 2024 01:06:04 -0500
Subject: [PATCH 3/8] Added media processing into main link loop; using prev
 tag to get page, rather than index math

---
 snscrape/modules/telegram.py | 84 +++++++++++++++++++-----------------
 1 file changed, 44 insertions(+), 40 deletions(-)

diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py
index 01e99318..b4f3d78e 100644
--- a/snscrape/modules/telegram.py
+++ b/snscrape/modules/telegram.py
@@ -152,7 +152,7 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False):
 						imageUrls = _STYLE_MEDIA_URL_PATTERN.findall(style)
 						if len(imageUrls) == 1:
 							media.append(Photo(url = imageUrls[0]))
-						continue
+
 				if _SINGLE_MEDIA_LINK_PATTERN.match(link['href']):
 					style = link.attrs.get('style', '')
 					imageUrls = _STYLE_MEDIA_URL_PATTERN.findall(style)
@@ -161,49 +161,23 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False):
 						# resp = self._get(image[0])
 						# encoded_string = base64.b64encode(resp.content)
 					# Individual photo or video link
-					continue
+
 				if link.text.startswith('@'):
 					mentions.append(link.text.strip('@'))
-					continue
+
 				if link.text.startswith('#'):
 					hashtags.append(link.text.strip('#'))
-					continue
+
+				if 'tgme_widget_message_voice_player' in link.get('class', []):
+					media.append(_parse_voice_message(link))
+					
+				if 'tgme_widget_message_video_player' in link.get('class', []):
+					media.append(_parse_video_message(link))
+
 				href = urllib.parse.urljoin(pageUrl, link['href'])
 				if (href not in outlinks) and (href != rawUrl) and (href != forwardedUrl):
 					outlinks.append(href)
 
-			for voicePlayer in post.find_all('a', {'class': 'tgme_widget_message_voice_player'}):
-				audioUrl = voicePlayer.find('audio')['src']
-				durationStr = voicePlayer.find('time').text
-				duration = _durationStrToSeconds(durationStr)
-				barHeights = [float(s['style'].split(':')[-1].strip(';%')) for s in voicePlayer.find('div', {'class': 'bar'}).find_all('s')]
-
-				media.append(VoiceMessage(url = audioUrl, duration = duration, bars = barHeights))
-
-			for videoPlayer in post.find_all('a', {'class': 'tgme_widget_message_video_player'}):
-				iTag = videoPlayer.find('i')
-				if iTag is None:
-					videoUrl = None 
-					videoThumbnailUrl = None
-				else:
-					style = iTag['style']
-					videoThumbnailUrl = _STYLE_MEDIA_URL_PATTERN.findall(style)[0]
-					videoTag = videoPlayer.find('video')
-					videoUrl = None if videoTag is None else videoTag['src']
-				mKwargs = {
-					'thumbnailUrl': videoThumbnailUrl,
-					'url': videoUrl,
-				}
-				timeTag = videoPlayer.find('time')
-				if timeTag is None:
-					# Example of duration-less GIF: https://t.me/thisisatestchannel19451923/3
-					cls = Gif
-				else:
-					cls = Video
-					durationStr = videoPlayer.find('time').text
-					mKwargs['duration'] = _durationStrToSeconds(durationStr)
-				media.append(cls(**mKwargs))
-
 			linkPreview = None
 			if (linkPreviewA := post.find('a', class_ = 'tgme_widget_message_link_preview')):
 				kwargs = {}
@@ -250,10 +224,10 @@ def get_items(self):
 			if not pageLink:
 				# some pages are missing a "tme_messages_more" tag, causing early termination
 				if '=' not in nextPageUrl:
-					nextPageUrl =  soup.find('link', attrs = {'rel': 'canonical'}, href = True)['href']
-				nextPostIndex = int(nextPageUrl.split('=')[-1]) - 20
+					nextPageUrl = soup.find('link', attrs = {'rel': 'prev'}, href = True)['href']
+				nextPostIndex = int(nextPageUrl.split('=')[-1])
 				if nextPostIndex > 20:
-					pageLink = {'href': nextPageUrl.split('=')[0] + f'={nextPostIndex}'}
+					pageLink = {'href': nextPageUrl}
 				else:
 					break
 			nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
@@ -333,4 +307,34 @@ def _telegramResponseOkCallback(r):
 	if r.status_code == 200:
 		return (True, None)
 	return (False, f'{r.status_code=}')
-	
\ No newline at end of file
+	
+def _parse_voice_message(voicePlayer):
+	"""Build a VoiceMessage from a voice player tag: its audio src, parsed duration, and one bar height per <s> tag."""
+	src = voicePlayer.find('audio')['src']
+	seconds = _durationStrToSeconds(voicePlayer.find('time').text)
+	heights = [float(bar['style'].split(':')[-1].strip(';%')) for bar in voicePlayer.find('div', {'class': 'bar'}).find_all('s')]
+	return VoiceMessage(url = src, duration = seconds, bars = heights)
+
+def _parse_video_message(videoPlayer):
+	"""Build a Video, or a Gif when the player has no <time> tag, from a video player tag."""
+	videoUrl = None
+	videoThumbnailUrl = None
+	if (iTag := videoPlayer.find('i')) is not None:
+		# A missing style attribute, or a style without a media URL, leaves the thumbnail unset instead of raising.
+		thumbnailUrls = _STYLE_MEDIA_URL_PATTERN.findall(iTag.get('style', ''))
+		videoThumbnailUrl = thumbnailUrls[0] if thumbnailUrls else None
+		videoTag = videoPlayer.find('video')
+		videoUrl = None if videoTag is None else videoTag['src']
+	mKwargs = {
+		'thumbnailUrl': videoThumbnailUrl,
+		'url': videoUrl,
+	}
+	timeTag = videoPlayer.find('time')
+	if timeTag is None:
+		# Example of duration-less GIF: https://t.me/thisisatestchannel19451923/3
+		cls = Gif
+	else:
+		cls = Video
+		durationStr = videoPlayer.find('time').text
+		mKwargs['duration'] = _durationStrToSeconds(durationStr)
+	return cls(**mKwargs)
\ No newline at end of file

From 2dfd1542f19bbadad603e00e61712943542fbfe1 Mon Sep 17 00:00:00 2001
From: John O'Sullivan <j.osullivan42@gmail.com>
Date: Thu, 22 Feb 2024 01:07:46 -0500
Subject: [PATCH 4/8] Forgot to remove a test log

---
 snscrape/modules/telegram.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py
index b4f3d78e..8f6d18d7 100644
--- a/snscrape/modules/telegram.py
+++ b/snscrape/modules/telegram.py
@@ -212,7 +212,6 @@ def get_items(self):
 			return
 		nextPageUrl = ''
 		while True:
-			print("About to yield from get_items")
 			yield from self._soup_to_items(soup, r.url)
 			dateElt = soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True)
 			if dateElt and 'href' in dateElt.attrs:

From a93f6a3fad0d19209a49c7b730fea73659743774 Mon Sep 17 00:00:00 2001
From: John O'Sullivan <j.osullivan42@gmail.com>
Date: Fri, 1 Mar 2024 12:51:26 -0500
Subject: [PATCH 5/8] Applying trislee's suggested fix for getting nextPageUrl

---
 snscrape/modules/telegram.py | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py
index 8f6d18d7..ac0feef8 100644
--- a/snscrape/modules/telegram.py
+++ b/snscrape/modules/telegram.py
@@ -219,17 +219,11 @@ def get_items(self):
 				if urlPieces and urlPieces[-1] == '1':
 					# if message 1 is the first message in the page, terminate scraping
 					break
-			pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True})
-			if not pageLink:
-				# some pages are missing a "tme_messages_more" tag, causing early termination
-				if '=' not in nextPageUrl:
-					nextPageUrl = soup.find('link', attrs = {'rel': 'prev'}, href = True)['href']
-				nextPostIndex = int(nextPageUrl.split('=')[-1])
-				if nextPostIndex > 20:
-					pageLink = {'href': nextPageUrl}
-				else:
-					break
-			nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
+			if pageLink := soup.find('link', attrs = {'rel': 'prev'}, href = True):
+				nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
+			else:
+				nextPostIndex = int(soup.find('div', attrs = {'class': 'tgme_widget_message', 'data-post': True})["data-post"].split("/")[-1])
+				nextPageUrl = urllib.parse.urljoin(r.url, r.url.split('?')[0] + f'?before={nextPostIndex}')
 			r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = _telegramResponseOkCallback)
 			if r.status_code != 200:
 				raise snscrape.base.ScraperException(f'Got status code {r.status_code}')

From a542aa57598f94f69fd7b69789e97045e92133da Mon Sep 17 00:00:00 2001
From: John O'Sullivan <j.osullivan42@gmail.com>
Date: Thu, 14 Mar 2024 01:50:38 -0400
Subject: [PATCH 6/8] Ensured termination on channels w/o an id=1 post, wrote
 test cases to prevent regression

---
 snscrape/modules/telegram.py | 87 +++++++++++++++++++++++++++++++++++-
 1 file changed, 86 insertions(+), 1 deletion(-)

diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py
index ac0feef8..7a85cb58 100644
--- a/snscrape/modules/telegram.py
+++ b/snscrape/modules/telegram.py
@@ -9,6 +9,8 @@
 import snscrape.base
 import typing
 import urllib.parse
+import unittest
+import threading
 
 _logger = logging.getLogger(__name__)
 _SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$')
@@ -212,6 +214,8 @@ def get_items(self):
 			return
 		nextPageUrl = ''
 		while True:
+			if soup.find("div", class_ = "tme_no_messages_found"):
+				break
 			yield from self._soup_to_items(soup, r.url)
 			dateElt = soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True)
 			if dateElt and 'href' in dateElt.attrs:
@@ -330,4 +334,85 @@ def _parse_video_message(videoPlayer):
 		cls = Video
 		durationStr = videoPlayer.find('time').text
 		mKwargs['duration'] = _durationStrToSeconds(durationStr)
-	return cls(**mKwargs)
\ No newline at end of file
+	return cls(**mKwargs)
+
+class TestTelegramChannelScraper(unittest.TestCase):
+
+	@staticmethod
+	def execute_with_timeout(func, timeout=10):
+		"""
+		Executes a function in a separate daemon thread and enforces a timeout.
+		If provided function throws an error, that same exception is re-raised in the calling thread.
+		Used to detect infinite loops in finite time, works cross-platform.
+
+		:param func: The function to execute. This function should accept no arguments.
+		:param timeout: The timeout in seconds.
+		:raises TimeoutError: If func is still running after timeout seconds.
+		"""
+		exceptions = []
+		def func_passing_exceptions():
+			try:
+				func()
+			except Exception as e:
+				exceptions.append(e)
+
+		# Daemon thread, so a worker stuck in an infinite loop cannot keep the interpreter alive.
+		thread = threading.Thread(target=func_passing_exceptions, daemon=True)
+		thread.start()
+		thread.join(timeout=timeout)
+		if exceptions:
+			# Raise the caught instance itself; re-instantiating its class fails for constructors needing several arguments.
+			raise exceptions[0]
+		if thread.is_alive():
+			raise TimeoutError(f"Function didn't complete within {timeout} seconds")
+
+	def test_scraping_termination_missing_prev(self):
+		"""Test scraping always terminates, even if the page's prev link is missing."""
+
+		def scrape_two_pages():
+			scraper = TelegramChannelScraper('WLM_USA_TEXAS?before=3766')
+			items = list()
+			num_items_on_page = 20
+			for item in scraper.get_items():
+				items.append(item)
+				if len(items) > 2 * num_items_on_page:
+					break
+		
+		self.execute_with_timeout(scrape_two_pages)
+
+	def test_scraping_termination_small_post_count(self):
+		"""Test scraping always terminates, even with small number of posts. This channel has only 28."""
+
+		def scrape_small_channel():
+			scraper = TelegramChannelScraper('AKCPB')
+			items = list(scraper.get_items())
+			return items
+		
+		self.execute_with_timeout(scrape_small_channel)
+
+	def test_scraping_termination_channels_without_post_id_one(self):
+		"""Test scraping gracefully handles channels missing a post where id=1."""
+
+		def scrape_empty_page():
+			scraper = TelegramChannelScraper('BREAKDCODE?before=3')
+			for _ in scraper.get_items():
+				pass
+		
+		self.execute_with_timeout(scrape_empty_page)
+
+	def test_media_order_preservation(self):
+		"""Test scraped media appears in the same order as in the post."""
+		scraper = TelegramChannelScraper('nexta_live?before=43103')
+		item = next(scraper.get_items(), None)
+		self.assertIsNotNone(item, "Failed to scrape any posts.")
+		self.assertEqual(item.url, "https://t.me/s/nexta_live/43102")
+
+		# Directly validate the types of the objects in the media array
+		expected_types = [Video, Photo, Video]  # Adjust based on expected types
+		actual_types = [type(media) for media in item.media] if item.media else []
+		
+		self.assertEqual(actual_types, expected_types, "Media did not appear in the expected order.")
+
+
+if __name__ == '__main__':
+	unittest.main()

From 7d061cb5279e153f829340f848bc4ba01d716f26 Mon Sep 17 00:00:00 2001
From: John O'Sullivan <j.osullivan42@gmail.com>
Date: Thu, 14 Mar 2024 01:55:16 -0400
Subject: [PATCH 7/8] Add docstring saying suite should run by directly running
 file

---
 snscrape/modules/telegram.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py
index 7a85cb58..c6e0b0ee 100644
--- a/snscrape/modules/telegram.py
+++ b/snscrape/modules/telegram.py
@@ -336,7 +336,9 @@ def _parse_video_message(videoPlayer):
 		mKwargs['duration'] = _durationStrToSeconds(durationStr)
 	return cls(**mKwargs)
 
+
 class TestTelegramChannelScraper(unittest.TestCase):
+	"""Run suite by directly calling this file."""
 
 	@staticmethod
 	def execute_with_timeout(func, timeout=10):

From 9309b1b01c6db15862809623e2c5adddecd894be Mon Sep 17 00:00:00 2001
From: John O'Sullivan <j.osullivan42@gmail.com>
Date: Thu, 14 Mar 2024 02:00:50 -0400
Subject: [PATCH 8/8] Correct some inaccurate test descriptions

---
 snscrape/modules/telegram.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py
index c6e0b0ee..dbf0f9b3 100644
--- a/snscrape/modules/telegram.py
+++ b/snscrape/modules/telegram.py
@@ -338,7 +338,7 @@ def _parse_video_message(videoPlayer):
 
 
 class TestTelegramChannelScraper(unittest.TestCase):
-	"""Run suite by directly calling this file."""
+	"""Run suite by directly running this file."""
 
 	@staticmethod
 	def execute_with_timeout(func, timeout=10):
@@ -383,7 +383,7 @@ def scrape_two_pages():
 		self.execute_with_timeout(scrape_two_pages)
 
 	def test_scraping_termination_small_post_count(self):
-		"""Test scraping always terminates, even with small number of posts. This channel has only 28."""
+		"""Test scraping always terminates, even with small number of posts. This channel's highest ID is 28."""
 
 		def scrape_small_channel():
 			scraper = TelegramChannelScraper('AKCPB')
@@ -392,8 +392,8 @@ def scrape_small_channel():
 		
 		self.execute_with_timeout(scrape_small_channel)
 
-	def test_scraping_termination_channels_without_post_id_one(self):
-		"""Test scraping gracefully handles channels missing a post where id=1."""
+	def test_scraping_termination_pages_without_posts(self):
+		"""Test scraping gracefully handles pages without any posts."""
 
 		def scrape_empty_page():
 			scraper = TelegramChannelScraper('BREAKDCODE?before=3')
@@ -407,10 +407,11 @@ def test_media_order_preservation(self):
 		scraper = TelegramChannelScraper('nexta_live?before=43103')
 		item = next(scraper.get_items(), None)
 		self.assertIsNotNone(item, "Failed to scrape any posts.")
+
+		# This particular post is known to include media [Video, Photo, Video]
 		self.assertEqual(item.url, "https://t.me/s/nexta_live/43102")
 
-		# Directly validate the types of the objects in the media array
-		expected_types = [Video, Photo, Video]  # Adjust based on expected types
+		expected_types = [Video, Photo, Video]
 		actual_types = [type(media) for media in item.media] if item.media else []
 		
 		self.assertEqual(actual_types, expected_types, "Media did not appear in the expected order.")