1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
|
From 00239388e3096277a55271a8786b4b5d6d2bec84 Mon Sep 17 00:00:00 2001
From: John O'Sullivan <j.osullivan42@gmail.com>
Date: Thu, 18 Jan 2024 11:37:32 -0500
Subject: [PATCH 1/8] WIP: Fixed 2.5 out of 5 issues mentioned in PR
---
snscrape/base.py | 1 +
snscrape/modules/telegram.py | 12 ++++++------
2 files changed, 7 insertions(+), 6 deletions(-)
diff --git a/snscrape/base.py b/snscrape/base.py
index c9e75d9d..5ce5e1da 100644
--- a/snscrape/base.py
+++ b/snscrape/base.py
@@ -193,6 +193,7 @@ def _request(self, method, url, params = None, data = None, headers = None, time
# The request is newly prepared on each retry because of potential cookie updates.
req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers))
environmentSettings = self._session.merge_environment_settings(req.url, proxies, None, None, None)
+ _logger.info("Hey there, I'm in here")
_logger.info(f'Retrieving {req.url}')
_logger.debug(f'... with headers: {headers!r}')
if data:
diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py
index 4e977656..54345d96 100644
--- a/snscrape/modules/telegram.py
+++ b/snscrape/modules/telegram.py
@@ -196,6 +196,7 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False):
}
timeTag = videoPlayer.find('time')
if timeTag is None:
+ _logger.warning(f'Could not find duration for video or GIF at {url}')
cls = Gif
else:
cls = Video
@@ -219,8 +220,6 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False):
else:
_logger.warning(f'Could not process link preview image on {url}')
linkPreview = LinkPreview(**kwargs)
- if kwargs['href'] in outlinks:
- outlinks.remove(kwargs['href'])
viewsSpan = post.find('span', class_ = 'tgme_widget_message_views')
views = None if viewsSpan is None else _parse_num(viewsSpan.text)
@@ -239,13 +238,14 @@ def get_items(self):
return
nextPageUrl = ''
while True:
+ print("About to yield from get_items")
yield from self._soup_to_items(soup, r.url)
- try:
- if soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True)['href'].split('/')[-1] == '1':
+ dateElt = soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True)
+ if dateElt and 'href' in dateElt.attrs:
+ urlPieces = dateElt['href'].split('/')
+ if urlPieces and urlPieces[-1] == '1':
# if message 1 is the first message in the page, terminate scraping
break
- except:
- pass
pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True})
if not pageLink:
# some pages are missing a "tme_messages_more" tag, causing early termination
From 670905fedb64656b94c6fb920c8628d318171b64 Mon Sep 17 00:00:00 2001
From: John O'Sullivan <j.osullivan42@gmail.com>
Date: Thu, 18 Jan 2024 11:46:46 -0500
Subject: [PATCH 2/8] Remove test log statement, add link to example GIF
---
snscrape/base.py | 1 -
snscrape/modules/telegram.py | 2 +-
2 files changed, 1 insertion(+), 2 deletions(-)
diff --git a/snscrape/base.py b/snscrape/base.py
index 5ce5e1da..c9e75d9d 100644
--- a/snscrape/base.py
+++ b/snscrape/base.py
@@ -193,7 +193,6 @@ def _request(self, method, url, params = None, data = None, headers = None, time
# The request is newly prepared on each retry because of potential cookie updates.
req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers))
environmentSettings = self._session.merge_environment_settings(req.url, proxies, None, None, None)
- _logger.info("Hey there, I'm in here")
_logger.info(f'Retrieving {req.url}')
_logger.debug(f'... with headers: {headers!r}')
if data:
diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py
index 54345d96..01e99318 100644
--- a/snscrape/modules/telegram.py
+++ b/snscrape/modules/telegram.py
@@ -196,7 +196,7 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False):
}
timeTag = videoPlayer.find('time')
if timeTag is None:
- _logger.warning(f'Could not find duration for video or GIF at {url}')
+ # Example of duration-less GIF: https://t.me/thisisatestchannel19451923/3
cls = Gif
else:
cls = Video
From 54df8832f5b5bc3af58c3faf953966a2070a834d Mon Sep 17 00:00:00 2001
From: John O'Sullivan <j.osullivan42@gmail.com>
Date: Thu, 22 Feb 2024 01:06:04 -0500
Subject: [PATCH 3/8] Added media processing into main link loop; using prev
tag to get page, rather than index math
---
snscrape/modules/telegram.py | 84 +++++++++++++++++++-----------------
1 file changed, 44 insertions(+), 40 deletions(-)
diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py
index 01e99318..b4f3d78e 100644
--- a/snscrape/modules/telegram.py
+++ b/snscrape/modules/telegram.py
@@ -152,7 +152,7 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False):
imageUrls = _STYLE_MEDIA_URL_PATTERN.findall(style)
if len(imageUrls) == 1:
media.append(Photo(url = imageUrls[0]))
- continue
+
if _SINGLE_MEDIA_LINK_PATTERN.match(link['href']):
style = link.attrs.get('style', '')
imageUrls = _STYLE_MEDIA_URL_PATTERN.findall(style)
@@ -161,49 +161,23 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False):
# resp = self._get(image[0])
# encoded_string = base64.b64encode(resp.content)
# Individual photo or video link
- continue
+
if link.text.startswith('@'):
mentions.append(link.text.strip('@'))
- continue
+
if link.text.startswith('#'):
hashtags.append(link.text.strip('#'))
- continue
+
+ if 'tgme_widget_message_voice_player' in link.get('class', []):
+ media.append(_parse_voice_message(link))
+
+ if 'tgme_widget_message_video_player' in link.get('class', []):
+ media.append(_parse_video_message(link))
+
href = urllib.parse.urljoin(pageUrl, link['href'])
if (href not in outlinks) and (href != rawUrl) and (href != forwardedUrl):
outlinks.append(href)
- for voicePlayer in post.find_all('a', {'class': 'tgme_widget_message_voice_player'}):
- audioUrl = voicePlayer.find('audio')['src']
- durationStr = voicePlayer.find('time').text
- duration = _durationStrToSeconds(durationStr)
- barHeights = [float(s['style'].split(':')[-1].strip(';%')) for s in voicePlayer.find('div', {'class': 'bar'}).find_all('s')]
-
- media.append(VoiceMessage(url = audioUrl, duration = duration, bars = barHeights))
-
- for videoPlayer in post.find_all('a', {'class': 'tgme_widget_message_video_player'}):
- iTag = videoPlayer.find('i')
- if iTag is None:
- videoUrl = None
- videoThumbnailUrl = None
- else:
- style = iTag['style']
- videoThumbnailUrl = _STYLE_MEDIA_URL_PATTERN.findall(style)[0]
- videoTag = videoPlayer.find('video')
- videoUrl = None if videoTag is None else videoTag['src']
- mKwargs = {
- 'thumbnailUrl': videoThumbnailUrl,
- 'url': videoUrl,
- }
- timeTag = videoPlayer.find('time')
- if timeTag is None:
- # Example of duration-less GIF: https://t.me/thisisatestchannel19451923/3
- cls = Gif
- else:
- cls = Video
- durationStr = videoPlayer.find('time').text
- mKwargs['duration'] = _durationStrToSeconds(durationStr)
- media.append(cls(**mKwargs))
-
linkPreview = None
if (linkPreviewA := post.find('a', class_ = 'tgme_widget_message_link_preview')):
kwargs = {}
@@ -250,10 +224,10 @@ def get_items(self):
if not pageLink:
# some pages are missing a "tme_messages_more" tag, causing early termination
if '=' not in nextPageUrl:
- nextPageUrl = soup.find('link', attrs = {'rel': 'canonical'}, href = True)['href']
- nextPostIndex = int(nextPageUrl.split('=')[-1]) - 20
+ nextPageUrl = soup.find('link', attrs = {'rel': 'prev'}, href = True)['href']
+ nextPostIndex = int(nextPageUrl.split('=')[-1])
if nextPostIndex > 20:
- pageLink = {'href': nextPageUrl.split('=')[0] + f'={nextPostIndex}'}
+ pageLink = {'href': nextPageUrl}
else:
break
nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
@@ -333,4 +307,34 @@ def _telegramResponseOkCallback(r):
if r.status_code == 200:
return (True, None)
return (False, f'{r.status_code=}')
-
\ No newline at end of file
+
+def _parse_voice_message(voicePlayer):
+ audioUrl = voicePlayer.find('audio')['src']
+ durationStr = voicePlayer.find('time').text
+ duration = _durationStrToSeconds(durationStr)
+ barHeights = [float(s['style'].split(':')[-1].strip(';%')) for s in voicePlayer.find('div', {'class': 'bar'}).find_all('s')]
+ return VoiceMessage(url = audioUrl, duration = duration, bars = barHeights)
+
+def _parse_video_message(videoPlayer):
+ iTag = videoPlayer.find('i')
+ if iTag is None:
+ videoUrl = None
+ videoThumbnailUrl = None
+ else:
+ style = iTag['style']
+ videoThumbnailUrl = _STYLE_MEDIA_URL_PATTERN.findall(style)[0]
+ videoTag = videoPlayer.find('video')
+ videoUrl = None if videoTag is None else videoTag['src']
+ mKwargs = {
+ 'thumbnailUrl': videoThumbnailUrl,
+ 'url': videoUrl,
+ }
+ timeTag = videoPlayer.find('time')
+ if timeTag is None:
+ # Example of duration-less GIF: https://t.me/thisisatestchannel19451923/3
+ cls = Gif
+ else:
+ cls = Video
+ durationStr = videoPlayer.find('time').text
+ mKwargs['duration'] = _durationStrToSeconds(durationStr)
+ return cls(**mKwargs)
\ No newline at end of file
From 2dfd1542f19bbadad603e00e61712943542fbfe1 Mon Sep 17 00:00:00 2001
From: John O'Sullivan <j.osullivan42@gmail.com>
Date: Thu, 22 Feb 2024 01:07:46 -0500
Subject: [PATCH 4/8] Forgot to remove a test log
---
snscrape/modules/telegram.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py
index b4f3d78e..8f6d18d7 100644
--- a/snscrape/modules/telegram.py
+++ b/snscrape/modules/telegram.py
@@ -212,7 +212,6 @@ def get_items(self):
return
nextPageUrl = ''
while True:
- print("About to yield from get_items")
yield from self._soup_to_items(soup, r.url)
dateElt = soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True)
if dateElt and 'href' in dateElt.attrs:
From a93f6a3fad0d19209a49c7b730fea73659743774 Mon Sep 17 00:00:00 2001
From: John O'Sullivan <j.osullivan42@gmail.com>
Date: Fri, 1 Mar 2024 12:51:26 -0500
Subject: [PATCH 5/8] Applying trislee's suggested fix for getting nextPageUrl
---
snscrape/modules/telegram.py | 16 +++++-----------
1 file changed, 5 insertions(+), 11 deletions(-)
diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py
index 8f6d18d7..ac0feef8 100644
--- a/snscrape/modules/telegram.py
+++ b/snscrape/modules/telegram.py
@@ -219,17 +219,11 @@ def get_items(self):
if urlPieces and urlPieces[-1] == '1':
# if message 1 is the first message in the page, terminate scraping
break
- pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True})
- if not pageLink:
- # some pages are missing a "tme_messages_more" tag, causing early termination
- if '=' not in nextPageUrl:
- nextPageUrl = soup.find('link', attrs = {'rel': 'prev'}, href = True)['href']
- nextPostIndex = int(nextPageUrl.split('=')[-1])
- if nextPostIndex > 20:
- pageLink = {'href': nextPageUrl}
- else:
- break
- nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
+ if pageLink := soup.find('link', attrs = {'rel': 'prev'}, href = True):
+ nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
+ else:
+ nextPostIndex = int(soup.find('div', attrs = {'class': 'tgme_widget_message', 'data-post': True})["data-post"].split("/")[-1])
+ nextPageUrl = urllib.parse.urljoin(r.url, r.url.split('?')[0] + f'?before={nextPostIndex}')
r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = _telegramResponseOkCallback)
if r.status_code != 200:
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
From a542aa57598f94f69fd7b69789e97045e92133da Mon Sep 17 00:00:00 2001
From: John O'Sullivan <j.osullivan42@gmail.com>
Date: Thu, 14 Mar 2024 01:50:38 -0400
Subject: [PATCH 6/8] Ensured termination on channels w/o an id=1 post, wrote
test cases to prevent regression
---
snscrape/modules/telegram.py | 87 +++++++++++++++++++++++++++++++++++-
1 file changed, 86 insertions(+), 1 deletion(-)
diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py
index ac0feef8..7a85cb58 100644
--- a/snscrape/modules/telegram.py
+++ b/snscrape/modules/telegram.py
@@ -9,6 +9,8 @@
import snscrape.base
import typing
import urllib.parse
+import unittest
+import threading
_logger = logging.getLogger(__name__)
_SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$')
@@ -212,6 +214,8 @@ def get_items(self):
return
nextPageUrl = ''
while True:
+ if soup.find("div", class_ = "tme_no_messages_found"):
+ break
yield from self._soup_to_items(soup, r.url)
dateElt = soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True)
if dateElt and 'href' in dateElt.attrs:
@@ -330,4 +334,85 @@ def _parse_video_message(videoPlayer):
cls = Video
durationStr = videoPlayer.find('time').text
mKwargs['duration'] = _durationStrToSeconds(durationStr)
- return cls(**mKwargs)
\ No newline at end of file
+ return cls(**mKwargs)
+
+class TestTelegramChannelScraper(unittest.TestCase):
+
+ @staticmethod
+ def execute_with_timeout(func, timeout=10):
+ """
+ Executes a function in a separate thread and enforces a timeout.
+ If provided function throws an error, it's re-raised in main thread.
+ Used to detect infinite loops in finite time, works cross-platform.
+
+ :param func: The function to execute. This function should accept no arguments.
+ :param timeout: The timeout in seconds.
+ """
+ exceptions=[]
+ def func_passing_exceptions():
+ try:
+ func()
+ except Exception as e:
+ exceptions.append((e.__class__, e, e.__traceback__))
+
+ thread = threading.Thread(target=func_passing_exceptions)
+ thread.start()
+ thread.join(timeout=timeout)
+
+ if exceptions:
+ exc_class, exc_instance, traceback = exceptions[0]
+ raise exc_class(exc_instance).with_traceback(traceback)
+
+ if thread.is_alive():
+ raise TimeoutError(f"Function didn't complete within {timeout} seconds")
+
+ def test_scraping_termination_missing_prev(self):
+ """Test scraping always terminates, even if the page's prev link is missing."""
+
+ def scrape_two_pages():
+ scraper = TelegramChannelScraper('WLM_USA_TEXAS?before=3766')
+ items = list()
+ num_items_on_page = 20
+ for item in scraper.get_items():
+ items.append(item)
+ if len(items) > 2 * num_items_on_page:
+ break
+
+ self.execute_with_timeout(scrape_two_pages)
+
+ def test_scraping_termination_small_post_count(self):
+ """Test scraping always terminates, even with small number of posts. This channel has only 28."""
+
+ def scrape_small_channel():
+ scraper = TelegramChannelScraper('AKCPB')
+ items = list(scraper.get_items())
+ return items
+
+ self.execute_with_timeout(scrape_small_channel)
+
+ def test_scraping_termination_channels_without_post_id_one(self):
+ """Test scraping gracefully handles channels missing a post where id=1."""
+
+ def scrape_empty_page():
+ scraper = TelegramChannelScraper('BREAKDCODE?before=3')
+ for _ in scraper.get_items():
+ pass
+
+ self.execute_with_timeout(scrape_empty_page)
+
+ def test_media_order_preservation(self):
+ """Test scraped media appears in the same order as in the post."""
+ scraper = TelegramChannelScraper('nexta_live?before=43103')
+ item = next(scraper.get_items(), None)
+ self.assertIsNotNone(item, "Failed to scrape any posts.")
+ self.assertEqual(item.url, "https://t.me/s/nexta_live/43102")
+
+ # Directly validate the types of the objects in the media array
+ expected_types = [Video, Photo, Video] # Adjust based on expected types
+ actual_types = [type(media) for media in item.media] if item.media else []
+
+ self.assertEqual(actual_types, expected_types, "Media did not appear in the expected order.")
+
+
+if __name__ == '__main__':
+ unittest.main()
From 7d061cb5279e153f829340f848bc4ba01d716f26 Mon Sep 17 00:00:00 2001
From: John O'Sullivan <j.osullivan42@gmail.com>
Date: Thu, 14 Mar 2024 01:55:16 -0400
Subject: [PATCH 7/8] Add docstring saying suite should run by directly running
file
---
snscrape/modules/telegram.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py
index 7a85cb58..c6e0b0ee 100644
--- a/snscrape/modules/telegram.py
+++ b/snscrape/modules/telegram.py
@@ -336,7 +336,9 @@ def _parse_video_message(videoPlayer):
mKwargs['duration'] = _durationStrToSeconds(durationStr)
return cls(**mKwargs)
+
class TestTelegramChannelScraper(unittest.TestCase):
+ """Run suite by directly calling this file."""
@staticmethod
def execute_with_timeout(func, timeout=10):
From 9309b1b01c6db15862809623e2c5adddecd894be Mon Sep 17 00:00:00 2001
From: John O'Sullivan <j.osullivan42@gmail.com>
Date: Thu, 14 Mar 2024 02:00:50 -0400
Subject: [PATCH 8/8] Correct some inaccurate test descriptions
---
snscrape/modules/telegram.py | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py
index c6e0b0ee..dbf0f9b3 100644
--- a/snscrape/modules/telegram.py
+++ b/snscrape/modules/telegram.py
@@ -338,7 +338,7 @@ def _parse_video_message(videoPlayer):
class TestTelegramChannelScraper(unittest.TestCase):
- """Run suite by directly calling this file."""
+ """Run suite by directly running this file."""
@staticmethod
def execute_with_timeout(func, timeout=10):
@@ -383,7 +383,7 @@ def scrape_two_pages():
self.execute_with_timeout(scrape_two_pages)
def test_scraping_termination_small_post_count(self):
- """Test scraping always terminates, even with small number of posts. This channel has only 28."""
+ """Test scraping always terminates, even with small number of posts. This channel's highest ID is 28."""
def scrape_small_channel():
scraper = TelegramChannelScraper('AKCPB')
@@ -392,8 +392,8 @@ def scrape_small_channel():
self.execute_with_timeout(scrape_small_channel)
- def test_scraping_termination_channels_without_post_id_one(self):
- """Test scraping gracefully handles channels missing a post where id=1."""
+ def test_scraping_termination_pages_without_posts(self):
+ """Test scraping gracefully handles pages without any posts."""
def scrape_empty_page():
scraper = TelegramChannelScraper('BREAKDCODE?before=3')
@@ -407,10 +407,11 @@ def test_media_order_preservation(self):
scraper = TelegramChannelScraper('nexta_live?before=43103')
item = next(scraper.get_items(), None)
self.assertIsNotNone(item, "Failed to scrape any posts.")
+
+ # This particular post is known to include media [Video, Photo, Video]
self.assertEqual(item.url, "https://t.me/s/nexta_live/43102")
- # Directly validate the types of the objects in the media array
- expected_types = [Video, Photo, Video] # Adjust based on expected types
+ expected_types = [Video, Photo, Video]
actual_types = [type(media) for media in item.media] if item.media else []
self.assertEqual(actual_types, expected_types, "Media did not appear in the expected order.")
|