From b50b9b8980906e4dca7fe4e1c61f56a20801a3f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=93=D0=BB=D0=B5=D0=B1?= Date: Sun, 13 Apr 2025 18:19:39 +0300 Subject: [PATCH] feat: init tg nodes, init web view --- Dockerfile | 6 + docker-compose.yaml | 27 +++ patcher.py | 402 --------------------------------------- requirements.txt | 5 +- runner.sh | 14 -- tg/tg_crawler.py | 73 +++++++ tg/tg_node_0.py | 34 ++++ tg/tg_node_1.py | 34 ++++ web/app.py | 34 ++++ web/templates/index.html | 368 +++++++++++++++++++++++++++++++++++ web/templates/logs.html | 246 ++++++++++++++++++++++++ 11 files changed, 826 insertions(+), 417 deletions(-) create mode 100644 Dockerfile create mode 100644 docker-compose.yaml delete mode 100644 patcher.py delete mode 100644 runner.sh create mode 100644 tg/tg_crawler.py create mode 100644 tg/tg_node_0.py create mode 100644 tg/tg_node_1.py create mode 100644 web/app.py create mode 100644 web/templates/index.html create mode 100644 web/templates/logs.html diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..f0c87b6 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,6 @@ +FROM python:3.12-slim +WORKDIR /app +COPY . /app +RUN pip install --no-cache-dir -r requirements.txt +EXPOSE 5000 +CMD ["python", "web/app.py"] diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 0000000..e754039 --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,27 @@ +version: '3.8' + +services: + web: + build: . + container_name: tg-monitor + ports: + - "5000:5000" + volumes: + - .:/app + command: python web/app.py + depends_on: + - tg_node_0 + - tg_node_1 + + tg_node_0: + build: . + container_name: tg-node-0 + command: python tg/node_tg_0.py + + tg_node_1: + build: . + container_name: tg-node-1 + command: python tg/node_tg_1.py + +volumes: + pg_data: diff --git a/patcher.py b/patcher.py deleted file mode 100644 index 99800dc..0000000 --- a/patcher.py +++ /dev/null @@ -1,402 +0,0 @@ -#!/usr/bin/env python3 -# this module is part of undetected_chromedriver - -from distutils.version import LooseVersion -import io -import json -import logging -import os -import pathlib -import platform -import random -import re -import shutil -import string -import sys -import time -from urllib.request import urlopen -from urllib.request import urlretrieve -import zipfile -from multiprocessing import Lock -import secrets - -logger = logging.getLogger(__name__) - -IS_POSIX = sys.platform.startswith(("darwin", "cygwin", "linux", "linux2")) - - -class Patcher(object): - lock = Lock() - exe_name = "chromedriver%s" - - platform = sys.platform - if platform.endswith("win32"): - d = "~/appdata/roaming/undetected_chromedriver" - elif "LAMBDA_TASK_ROOT" in os.environ: - d = "/tmp/undetected_chromedriver" - elif platform.startswith(("linux", "linux2")): - d = "~/.local/share/undetected_chromedriver" - elif platform.endswith("darwin"): - d = "~/Library/Application Support/undetected_chromedriver" - else: - d = "~/.undetected_chromedriver" - data_path = os.path.abspath(os.path.expanduser(d)) - - def __init__( - self, - executable_path=None, - force=False, - version_main: int = 0, - user_multi_procs=False, - ): - """ - Args: - executable_path: None = automatic - a full file path to the chromedriver executable - force: False - terminate processes which are holding lock - version_main: 0 = auto - specify main chrome version (rounded, ex: 82) - """ - self.force = force - self._custom_exe_path = False - prefix = secrets.token_hex(8) - self.user_multi_procs = user_multi_procs - - self.is_old_chromedriver = version_main and version_main <= 114 - # Needs to be called before self.exe_name is accessed - self._set_platform_name() - - if not os.path.exists(self.data_path): - os.makedirs(self.data_path, exist_ok=True) - - if not executable_path: - self.executable_path = os.path.join( - self.data_path, "_".join([prefix, self.exe_name]) - ) - - if not IS_POSIX: - if executable_path: - if not executable_path[-4:] == ".exe": - executable_path += ".exe" - - self.zip_path = os.path.join(self.data_path, prefix) - - if not executable_path: - if not self.user_multi_procs: - self.executable_path = os.path.abspath( - os.path.join(".", self.executable_path) - ) - - if executable_path: - self._custom_exe_path = True - self.executable_path = executable_path - - # Set the correct repository to download the Chromedriver from - if self.is_old_chromedriver: - self.url_repo = "https://chromedriver.storage.googleapis.com" - else: - self.url_repo = "https://googlechromelabs.github.io/chrome-for-testing" - - self.version_main = version_main - self.version_full = None - - def _set_platform_name(self): - """ - Set the platform and exe name based on the platform undetected_chromedriver is running on - in order to download the correct chromedriver. - """ - if self.platform.endswith("win32"): - self.platform_name = "win32" - self.exe_name %= ".exe" - if self.platform.endswith(("linux", "linux2")): - self.platform_name = "linux64" - self.exe_name %= "" - if self.platform.endswith("darwin"): - if self.is_old_chromedriver: - self.platform_name = "mac64" - else: - self.platform_name = "mac-x64" - self.exe_name %= "" - - def auto(self, executable_path=None, force=False, version_main=None, _=None): - """ - - Args: - executable_path: - force: - version_main: - - Returns: - - """ - p = pathlib.Path(self.data_path) - if self.user_multi_procs: - with Lock(): - files = list(p.rglob("*chromedriver*")) - most_recent = max(files, key=lambda f: f.stat().st_mtime) - files.remove(most_recent) - list(map(lambda f: f.unlink(), files)) - if self.is_binary_patched(most_recent): - self.executable_path = str(most_recent) - return True - - if executable_path: - self.executable_path = executable_path - self._custom_exe_path = True - - if self._custom_exe_path: - ispatched = self.is_binary_patched(self.executable_path) - if not ispatched: - return self.patch_exe() - else: - return - - if version_main: - self.version_main = version_main - if force is True: - self.force = force - - try: - os.unlink(self.executable_path) - except PermissionError: - if self.force: - self.force_kill_instances(self.executable_path) - return self.auto(force=not self.force) - try: - if self.is_binary_patched(): - # assumes already running AND patched - return True - except PermissionError: - pass - # return False - except FileNotFoundError: - pass - - release = self.fetch_release_number() - self.version_main = release.version[0] - self.version_full = release - self.unzip_package(self.fetch_package()) - return self.patch() - - def driver_binary_in_use(self, path: str = None) -> bool: - """ - naive test to check if a found chromedriver binary is - currently in use - - Args: - path: a string or PathLike object to the binary to check. - if not specified, we check use this object's executable_path - """ - if not path: - path = self.executable_path - p = pathlib.Path(path) - - if not p.exists(): - raise OSError("file does not exist: %s" % p) - try: - with open(p, mode="a+b") as fs: - exc = [] - try: - - fs.seek(0, 0) - except PermissionError as e: - exc.append(e) # since some systems apprently allow seeking - # we conduct another test - try: - fs.readline() - except PermissionError as e: - exc.append(e) - - if exc: - - return True - return False - # ok safe to assume this is in use - except Exception as e: - # logger.exception("whoops ", e) - pass - - def cleanup_unused_files(self): - p = pathlib.Path(self.data_path) - items = list(p.glob("*undetected*")) - for item in items: - try: - item.unlink() - except: - pass - - def patch(self): - self.patch_exe() - return self.is_binary_patched() - - def fetch_release_number(self): - """ - Gets the latest major version available, or the latest major version of self.target_version if set explicitly. - :return: version string - :rtype: LooseVersion - """ - # Endpoint for old versions of Chromedriver (114 and below) - if self.is_old_chromedriver: - path = f"/latest_release_{self.version_main}" - path = path.upper() - logger.debug("getting release number from %s" % path) - return LooseVersion(urlopen(self.url_repo + path).read().decode()) - - # Endpoint for new versions of Chromedriver (115+) - if not self.version_main: - # Fetch the latest version - path = "/last-known-good-versions-with-downloads.json" - logger.debug("getting release number from %s" % path) - with urlopen(self.url_repo + path) as conn: - response = conn.read().decode() - - last_versions = json.loads(response) - return LooseVersion(last_versions["channels"]["Stable"]["version"]) - - # Fetch the latest minor version of the major version provided - path = "/latest-versions-per-milestone-with-downloads.json" - logger.debug("getting release number from %s" % path) - with urlopen(self.url_repo + path) as conn: - response = conn.read().decode() - - major_versions = json.loads(response) - return LooseVersion(major_versions["milestones"][str(self.version_main)]["version"]) - - def parse_exe_version(self): - with io.open(self.executable_path, "rb") as f: - for line in iter(lambda: f.readline(), b""): - match = re.search(rb"platform_handle\x00content\x00([0-9.]*)", line) - if match: - return LooseVersion(match[1].decode()) - - def fetch_package(self): - """ - Downloads ChromeDriver from source - - :return: path to downloaded file - """ - zip_name = f"chromedriver_{self.platform_name}.zip" - if self.is_old_chromedriver: - download_url = "%s/%s/%s" % (self.url_repo, self.version_full.vstring, zip_name) - else: - zip_name = zip_name.replace("_", "-", 1) - download_url = "https://storage.googleapis.com/chrome-for-testing-public/%s/%s/%s" - download_url %= (self.version_full.vstring, self.platform_name, zip_name) - - logger.debug("downloading from %s" % download_url) - return urlretrieve(download_url)[0] - - def unzip_package(self, fp): - """ - Does what it says - - :return: path to unpacked executable - """ - exe_path = self.exe_name - if not self.is_old_chromedriver: - # The new chromedriver unzips into its own folder - zip_name = f"chromedriver-{self.platform_name}" - exe_path = os.path.join(zip_name, self.exe_name) - - logger.debug("unzipping %s" % fp) - try: - os.unlink(self.zip_path) - except (FileNotFoundError, OSError): - pass - - os.makedirs(self.zip_path, mode=0o755, exist_ok=True) - with zipfile.ZipFile(fp, mode="r") as zf: - zf.extractall(self.zip_path) - os.rename(os.path.join(self.zip_path, exe_path), self.executable_path) - os.remove(fp) - shutil.rmtree(self.zip_path) - os.chmod(self.executable_path, 0o755) - return self.executable_path - - @staticmethod - def force_kill_instances(exe_name): - """ - kills running instances. - :param: executable name to kill, may be a path as well - - :return: True on success else False - """ - exe_name = os.path.basename(exe_name) - if IS_POSIX: - r = os.system("kill -f -9 $(pidof %s)" % exe_name) - else: - r = os.system("taskkill /f /im %s" % exe_name) - return not r - - @staticmethod - def gen_random_cdc(): - cdc = random.choices(string.ascii_letters, k=27) - return "".join(cdc).encode() - - def is_binary_patched(self, executable_path=None): - executable_path = executable_path or self.executable_path - try: - with io.open(executable_path, "rb") as fh: - return fh.read().find(b"undetected chromedriver") != -1 - except FileNotFoundError: - return False - - def patch_exe(self): - start = time.perf_counter() - logger.info("patching driver executable %s" % self.executable_path) - with io.open(self.executable_path, "r+b") as fh: - content = fh.read() - # match_injected_codeblock = re.search(rb"{window.*;}", content) - match_injected_codeblock = re.search(rb"\{window\.cdc.*?;\}", content) - if match_injected_codeblock: - target_bytes = match_injected_codeblock[0] - new_target_bytes = ( - b'{console.log("undetected chromedriver 1337!")}'.ljust( - len(target_bytes), b" " - ) - ) - new_content = content.replace(target_bytes, new_target_bytes) - if new_content == content: - logger.warning( - "something went wrong patching the driver binary. could not find injection code block" - ) - else: - logger.debug( - "found block:\n%s\nreplacing with:\n%s" - % (target_bytes, new_target_bytes) - ) - fh.seek(0) - fh.write(new_content) - logger.debug( - "patching took us {:.2f} seconds".format(time.perf_counter() - start) - ) - - def __repr__(self): - return "{0:s}({1:s})".format( - self.__class__.__name__, - self.executable_path, - ) - - def __del__(self): - if self._custom_exe_path: - # if the driver binary is specified by user - # we assume it is important enough to not delete it - return - else: - timeout = 3 # stop trying after this many seconds - t = time.monotonic() - now = lambda: time.monotonic() - while now() - t > timeout: - # we don't want to wait until the end of time - try: - if self.user_multi_procs: - break - os.unlink(self.executable_path) - logger.debug("successfully unlinked %s" % self.executable_path) - break - except (OSError, RuntimeError, PermissionError): - time.sleep(0.01) - continue - except FileNotFoundError: - break diff --git a/requirements.txt b/requirements.txt index 0d8b373..aafe21d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,7 @@ python-dotenv tls-client undetected-chromedriver selenium -grpc_interceptor_headers \ No newline at end of file +grpc_interceptor_headers +telethon +schedule +psycopg2-binary \ No newline at end of file diff --git a/runner.sh b/runner.sh deleted file mode 100644 index 185249e..0000000 --- a/runner.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -TARGET_DIR=$(python -c "import undetected_chromedriver as uc; print(uc.__path__[0])" 2>/dev/null) - -if [ -d "$TARGET_DIR" ]; then - echo "patcher.py в $TARGET_DIR" - cp patcher.py "$TARGET_DIR" -else - echo "undetected_chromedriver не найден" -fi - -pip install -r requirements.txt - -python initiator.py \ No newline at end of file diff --git a/tg/tg_crawler.py b/tg/tg_crawler.py new file mode 100644 index 0000000..eefe1df --- /dev/null +++ b/tg/tg_crawler.py @@ -0,0 +1,73 @@ +import logging +import psycopg2 +from datetime import datetime +from telethon.sync import TelegramClient +from telethon.tl.functions.messages import GetHistoryRequest + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler('tg_nodes.log'), + logging.StreamHandler() + ] +) + +class TelegramChannelMonitor: + db_config = None + + def __init__(self, session_name, api_id, api_hash, channel_username, source_name): + self.session_name = session_name + self.api_id = api_id + self.api_hash = api_hash + self.channel_username = channel_username + self.source_name = source_name + + @classmethod + def set_db_config(cls, config): + cls.db_config = config + + def fetch_last_post(self): + logging.info(f"[{self.source_name}] checking a new post...") + + try: + with TelegramClient(self.session_name, self.api_id, self.api_hash) as client: + entity = client.get_entity(self.channel_username) + history = client(GetHistoryRequest( + peer=entity, + limit=1, + offset_date=None, + offset_id=0, + max_id=0, + min_id=0, + add_offset=0, + hash=0 + )) + + if history.messages: + msg = history.messages[0] + logging.info(f"[{self.source_name}] received a post: {msg.message[:60]}...") + self.save_to_db(self.source_name, msg.message) + else: + logging.info(f"[{self.source_name}] there is no new messages") + except Exception as e: + logging.error(f"[{self.source_name}] error when receiving a post: {e}") + + def save_to_db(self, source, message): + if not self.db_config: + logging.error("DB config is not set") + return + + try: + conn = psycopg2.connect(**self.db_config) + cur = conn.cursor() + cur.execute( + "INSERT INTO leaks (source, message) VALUES (%s, %s)", + (source, message) + ) + conn.commit() + cur.close() + conn.close() + logging.info(f"[{self.source_name}] message is recorded in the database") + except Exception as e: + logging.error(f"[{self.source_name}] error when writing to the database: {e}") diff --git a/tg/tg_node_0.py b/tg/tg_node_0.py new file mode 100644 index 0000000..7693788 --- /dev/null +++ b/tg/tg_node_0.py @@ -0,0 +1,34 @@ +import asyncio +import os +from apscheduler.schedulers.asyncio import AsyncIOScheduler +from pytz import timezone +from tg_crawler import TelegramChannelMonitor +from dotenv import load_dotenv + +load_dotenv() + +TelegramChannelMonitor.set_db_config({ + 'host': os.getenv("HOST"), + 'port': os.getenv("PORT"), + 'database': os.getenv("DBNAME"), + 'user': os.getenv("USER"), + 'password': os.getenv("PASSWORD") +}) + +monitor = TelegramChannelMonitor( + session_name='session_trueosint', + api_id=os.getenv("TELETHON_API_ID"), + api_hash=os.getenv("TELETHON_API_HASH"), + channel_username='trueosint', + source_name='trueosint' +) + +def main(): + scheduler = AsyncIOScheduler() + scheduler.add_job(monitor.fetch_last_post, "cron", hour=9, minute=0, timezone=timezone("Europe/Moscow")) + scheduler.start() + + asyncio.get_event_loop().run_forever() + +if __name__ == '__main__': + main() diff --git a/tg/tg_node_1.py b/tg/tg_node_1.py new file mode 100644 index 0000000..8d8b8ec --- /dev/null +++ b/tg/tg_node_1.py @@ -0,0 +1,34 @@ +import asyncio +import os +from apscheduler.schedulers.asyncio import AsyncIOScheduler +from pytz import timezone +from tg_crawler import TelegramChannelMonitor +from dotenv import load_dotenv + +load_dotenv() + +TelegramChannelMonitor.set_db_config({ + 'host': os.getenv("HOST"), + 'port': os.getenv("PORT"), + 'database': os.getenv("DBNAME"), + 'user': os.getenv("USER"), + 'password': os.getenv("PASSWORD") +}) + +monitor = TelegramChannelMonitor( + session_name='session_dataleak', + api_id=os.getenv("TELETHON_API_ID"), + api_hash=os.getenv("TELETHON_API_HASH"), + channel_username='dataleak', + source_name='dataleak' +) + +def main(): + scheduler = AsyncIOScheduler() + scheduler.add_job(monitor.fetch_last_post, "cron", hour=9, minute=0, timezone=timezone("Europe/Moscow")) + scheduler.start() + + asyncio.get_event_loop().run_forever() + +if __name__ == '__main__': + main() diff --git a/web/app.py b/web/app.py new file mode 100644 index 0000000..f4f8cec --- /dev/null +++ b/web/app.py @@ -0,0 +1,34 @@ +import os +from flask import Flask, render_template +import psycopg2 + +app = Flask(__name__) + +DB_CONFIG = { + 'host': os.getenv("HOST"), + 'port': os.getenv("PORT"), + 'database': os.getenv("DBNAME"), + 'user': os.getenv("USER"), + 'password': os.getenv("PASSWORD") +} + +@app.route("/") +def index(): + with psycopg2.connect(**DB_CONFIG) as conn: + with conn.cursor() as cur: + cur.execute("SELECT source, message, created_at FROM leaks ORDER BY created_at DESC LIMIT 50") + leaks = cur.fetchall() + return render_template("index.html", leaks=leaks) + +@app.route("/logs") +def logs(): + log_path = os.path.join(os.path.dirname(__file__), '..', 'tg_nodes.log') + try: + with open(log_path, 'r', encoding='utf-8') as f: + lines = f.readlines()[-100:] + except FileNotFoundError: + lines = ["Лог-файл не найден"] + return render_template("logs.html", logs=lines) + +if __name__ == "__main__": + app.run(debug=True) diff --git a/web/templates/index.html b/web/templates/index.html new file mode 100644 index 0000000..6fdcb7a --- /dev/null +++ b/web/templates/index.html @@ -0,0 +1,368 @@ + + + + + + Система мониторинга утечек данных + + + + + + + + + + + + +
+

Система мониторинга утечек данных

+ + +
+

Карта утечек

+
+
+ + +
+
+

Утечки за месяц

+ +
+
+

Состояние парсеров

+ +
+
+

Источники утечек

+ +
+
+

Скриншоты с форумов

+
+ Скриншот 1 + Скриншот 2 + Скриншот 3 + Скриншот 4 + Скриншот 5 +
+
+
+

Логи системы

+
+

[2023-10-01 12:34] Парсер форума запущен.

+

[2023-10-01 12:35] Найдена новая утечка на форуме X.

+

[2023-10-01 12:36] Ошибка парсера Telegram: timeout.

+

[2023-10-01 12:37] Утечка данных в Москве зафиксирована.

+

[2023-10-01 12:38] Парсер даркнета завершил работу.

+

[2023-10-01 12:39] Новый скриншот добавлен в базу.

+
+
+
+
+ + + + + + \ No newline at end of file diff --git a/web/templates/logs.html b/web/templates/logs.html new file mode 100644 index 0000000..a9eb2eb --- /dev/null +++ b/web/templates/logs.html @@ -0,0 +1,246 @@ + + + + + + Логи системы + + + + + + + + + + + +
+ + + +
+ + +
+

Логи системы

+
+

[2023-10-01 12:34] Парсер форума запущен.

+

[2023-10-01 12:35] Найдена новая утечка на форуме X.

+

[2023-10-01 12:36] Ошибка парсера Telegram: timeout.

+

[2023-10-01 12:37] Утечка данных в Москве зафиксирована.

+

[2023-10-01 12:38] Парсер даркнета завершил работу.

+

[2023-10-01 12:39] Новый скриншот добавлен в базу.

+ +
+
+
+ + + + \ No newline at end of file