diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..f0c87b6
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,6 @@
+FROM python:3.12-slim
+WORKDIR /app
+COPY . /app
+RUN pip install --no-cache-dir -r requirements.txt
+EXPOSE 5000
+CMD ["python", "web/app.py"]
diff --git a/docker-compose.yaml b/docker-compose.yaml
new file mode 100644
index 0000000..e754039
--- /dev/null
+++ b/docker-compose.yaml
@@ -0,0 +1,27 @@
+version: '3.8'
+
+services:
+ web:
+ build: .
+ container_name: tg-monitor
+ ports:
+ - "5000:5000"
+ volumes:
+ - .:/app
+ command: python web/app.py
+ depends_on:
+ - tg_node_0
+ - tg_node_1
+
+ tg_node_0:
+ build: .
+ container_name: tg-node-0
+ command: python tg/node_tg_0.py
+
+ tg_node_1:
+ build: .
+ container_name: tg-node-1
+ command: python tg/node_tg_1.py
+
+volumes:
+ pg_data:
diff --git a/patcher.py b/patcher.py
deleted file mode 100644
index 99800dc..0000000
--- a/patcher.py
+++ /dev/null
@@ -1,402 +0,0 @@
-#!/usr/bin/env python3
-# this module is part of undetected_chromedriver
-
-from distutils.version import LooseVersion
-import io
-import json
-import logging
-import os
-import pathlib
-import platform
-import random
-import re
-import shutil
-import string
-import sys
-import time
-from urllib.request import urlopen
-from urllib.request import urlretrieve
-import zipfile
-from multiprocessing import Lock
-import secrets
-
-logger = logging.getLogger(__name__)
-
-IS_POSIX = sys.platform.startswith(("darwin", "cygwin", "linux", "linux2"))
-
-
-class Patcher(object):
- lock = Lock()
- exe_name = "chromedriver%s"
-
- platform = sys.platform
- if platform.endswith("win32"):
- d = "~/appdata/roaming/undetected_chromedriver"
- elif "LAMBDA_TASK_ROOT" in os.environ:
- d = "/tmp/undetected_chromedriver"
- elif platform.startswith(("linux", "linux2")):
- d = "~/.local/share/undetected_chromedriver"
- elif platform.endswith("darwin"):
- d = "~/Library/Application Support/undetected_chromedriver"
- else:
- d = "~/.undetected_chromedriver"
- data_path = os.path.abspath(os.path.expanduser(d))
-
- def __init__(
- self,
- executable_path=None,
- force=False,
- version_main: int = 0,
- user_multi_procs=False,
- ):
- """
- Args:
- executable_path: None = automatic
- a full file path to the chromedriver executable
- force: False
- terminate processes which are holding lock
- version_main: 0 = auto
- specify main chrome version (rounded, ex: 82)
- """
- self.force = force
- self._custom_exe_path = False
- prefix = secrets.token_hex(8)
- self.user_multi_procs = user_multi_procs
-
- self.is_old_chromedriver = version_main and version_main <= 114
- # Needs to be called before self.exe_name is accessed
- self._set_platform_name()
-
- if not os.path.exists(self.data_path):
- os.makedirs(self.data_path, exist_ok=True)
-
- if not executable_path:
- self.executable_path = os.path.join(
- self.data_path, "_".join([prefix, self.exe_name])
- )
-
- if not IS_POSIX:
- if executable_path:
- if not executable_path[-4:] == ".exe":
- executable_path += ".exe"
-
- self.zip_path = os.path.join(self.data_path, prefix)
-
- if not executable_path:
- if not self.user_multi_procs:
- self.executable_path = os.path.abspath(
- os.path.join(".", self.executable_path)
- )
-
- if executable_path:
- self._custom_exe_path = True
- self.executable_path = executable_path
-
- # Set the correct repository to download the Chromedriver from
- if self.is_old_chromedriver:
- self.url_repo = "https://chromedriver.storage.googleapis.com"
- else:
- self.url_repo = "https://googlechromelabs.github.io/chrome-for-testing"
-
- self.version_main = version_main
- self.version_full = None
-
- def _set_platform_name(self):
- """
- Set the platform and exe name based on the platform undetected_chromedriver is running on
- in order to download the correct chromedriver.
- """
- if self.platform.endswith("win32"):
- self.platform_name = "win32"
- self.exe_name %= ".exe"
- if self.platform.endswith(("linux", "linux2")):
- self.platform_name = "linux64"
- self.exe_name %= ""
- if self.platform.endswith("darwin"):
- if self.is_old_chromedriver:
- self.platform_name = "mac64"
- else:
- self.platform_name = "mac-x64"
- self.exe_name %= ""
-
- def auto(self, executable_path=None, force=False, version_main=None, _=None):
- """
-
- Args:
- executable_path:
- force:
- version_main:
-
- Returns:
-
- """
- p = pathlib.Path(self.data_path)
- if self.user_multi_procs:
- with Lock():
- files = list(p.rglob("*chromedriver*"))
- most_recent = max(files, key=lambda f: f.stat().st_mtime)
- files.remove(most_recent)
- list(map(lambda f: f.unlink(), files))
- if self.is_binary_patched(most_recent):
- self.executable_path = str(most_recent)
- return True
-
- if executable_path:
- self.executable_path = executable_path
- self._custom_exe_path = True
-
- if self._custom_exe_path:
- ispatched = self.is_binary_patched(self.executable_path)
- if not ispatched:
- return self.patch_exe()
- else:
- return
-
- if version_main:
- self.version_main = version_main
- if force is True:
- self.force = force
-
- try:
- os.unlink(self.executable_path)
- except PermissionError:
- if self.force:
- self.force_kill_instances(self.executable_path)
- return self.auto(force=not self.force)
- try:
- if self.is_binary_patched():
- # assumes already running AND patched
- return True
- except PermissionError:
- pass
- # return False
- except FileNotFoundError:
- pass
-
- release = self.fetch_release_number()
- self.version_main = release.version[0]
- self.version_full = release
- self.unzip_package(self.fetch_package())
- return self.patch()
-
- def driver_binary_in_use(self, path: str = None) -> bool:
- """
- naive test to check if a found chromedriver binary is
- currently in use
-
- Args:
- path: a string or PathLike object to the binary to check.
- if not specified, we check use this object's executable_path
- """
- if not path:
- path = self.executable_path
- p = pathlib.Path(path)
-
- if not p.exists():
- raise OSError("file does not exist: %s" % p)
- try:
- with open(p, mode="a+b") as fs:
- exc = []
- try:
-
- fs.seek(0, 0)
- except PermissionError as e:
- exc.append(e) # since some systems apprently allow seeking
- # we conduct another test
- try:
- fs.readline()
- except PermissionError as e:
- exc.append(e)
-
- if exc:
-
- return True
- return False
- # ok safe to assume this is in use
- except Exception as e:
- # logger.exception("whoops ", e)
- pass
-
- def cleanup_unused_files(self):
- p = pathlib.Path(self.data_path)
- items = list(p.glob("*undetected*"))
- for item in items:
- try:
- item.unlink()
- except:
- pass
-
- def patch(self):
- self.patch_exe()
- return self.is_binary_patched()
-
- def fetch_release_number(self):
- """
- Gets the latest major version available, or the latest major version of self.target_version if set explicitly.
- :return: version string
- :rtype: LooseVersion
- """
- # Endpoint for old versions of Chromedriver (114 and below)
- if self.is_old_chromedriver:
- path = f"/latest_release_{self.version_main}"
- path = path.upper()
- logger.debug("getting release number from %s" % path)
- return LooseVersion(urlopen(self.url_repo + path).read().decode())
-
- # Endpoint for new versions of Chromedriver (115+)
- if not self.version_main:
- # Fetch the latest version
- path = "/last-known-good-versions-with-downloads.json"
- logger.debug("getting release number from %s" % path)
- with urlopen(self.url_repo + path) as conn:
- response = conn.read().decode()
-
- last_versions = json.loads(response)
- return LooseVersion(last_versions["channels"]["Stable"]["version"])
-
- # Fetch the latest minor version of the major version provided
- path = "/latest-versions-per-milestone-with-downloads.json"
- logger.debug("getting release number from %s" % path)
- with urlopen(self.url_repo + path) as conn:
- response = conn.read().decode()
-
- major_versions = json.loads(response)
- return LooseVersion(major_versions["milestones"][str(self.version_main)]["version"])
-
- def parse_exe_version(self):
- with io.open(self.executable_path, "rb") as f:
- for line in iter(lambda: f.readline(), b""):
- match = re.search(rb"platform_handle\x00content\x00([0-9.]*)", line)
- if match:
- return LooseVersion(match[1].decode())
-
- def fetch_package(self):
- """
- Downloads ChromeDriver from source
-
- :return: path to downloaded file
- """
- zip_name = f"chromedriver_{self.platform_name}.zip"
- if self.is_old_chromedriver:
- download_url = "%s/%s/%s" % (self.url_repo, self.version_full.vstring, zip_name)
- else:
- zip_name = zip_name.replace("_", "-", 1)
- download_url = "https://storage.googleapis.com/chrome-for-testing-public/%s/%s/%s"
- download_url %= (self.version_full.vstring, self.platform_name, zip_name)
-
- logger.debug("downloading from %s" % download_url)
- return urlretrieve(download_url)[0]
-
- def unzip_package(self, fp):
- """
- Does what it says
-
- :return: path to unpacked executable
- """
- exe_path = self.exe_name
- if not self.is_old_chromedriver:
- # The new chromedriver unzips into its own folder
- zip_name = f"chromedriver-{self.platform_name}"
- exe_path = os.path.join(zip_name, self.exe_name)
-
- logger.debug("unzipping %s" % fp)
- try:
- os.unlink(self.zip_path)
- except (FileNotFoundError, OSError):
- pass
-
- os.makedirs(self.zip_path, mode=0o755, exist_ok=True)
- with zipfile.ZipFile(fp, mode="r") as zf:
- zf.extractall(self.zip_path)
- os.rename(os.path.join(self.zip_path, exe_path), self.executable_path)
- os.remove(fp)
- shutil.rmtree(self.zip_path)
- os.chmod(self.executable_path, 0o755)
- return self.executable_path
-
- @staticmethod
- def force_kill_instances(exe_name):
- """
- kills running instances.
- :param: executable name to kill, may be a path as well
-
- :return: True on success else False
- """
- exe_name = os.path.basename(exe_name)
- if IS_POSIX:
- r = os.system("kill -f -9 $(pidof %s)" % exe_name)
- else:
- r = os.system("taskkill /f /im %s" % exe_name)
- return not r
-
- @staticmethod
- def gen_random_cdc():
- cdc = random.choices(string.ascii_letters, k=27)
- return "".join(cdc).encode()
-
- def is_binary_patched(self, executable_path=None):
- executable_path = executable_path or self.executable_path
- try:
- with io.open(executable_path, "rb") as fh:
- return fh.read().find(b"undetected chromedriver") != -1
- except FileNotFoundError:
- return False
-
- def patch_exe(self):
- start = time.perf_counter()
- logger.info("patching driver executable %s" % self.executable_path)
- with io.open(self.executable_path, "r+b") as fh:
- content = fh.read()
- # match_injected_codeblock = re.search(rb"{window.*;}", content)
- match_injected_codeblock = re.search(rb"\{window\.cdc.*?;\}", content)
- if match_injected_codeblock:
- target_bytes = match_injected_codeblock[0]
- new_target_bytes = (
- b'{console.log("undetected chromedriver 1337!")}'.ljust(
- len(target_bytes), b" "
- )
- )
- new_content = content.replace(target_bytes, new_target_bytes)
- if new_content == content:
- logger.warning(
- "something went wrong patching the driver binary. could not find injection code block"
- )
- else:
- logger.debug(
- "found block:\n%s\nreplacing with:\n%s"
- % (target_bytes, new_target_bytes)
- )
- fh.seek(0)
- fh.write(new_content)
- logger.debug(
- "patching took us {:.2f} seconds".format(time.perf_counter() - start)
- )
-
- def __repr__(self):
- return "{0:s}({1:s})".format(
- self.__class__.__name__,
- self.executable_path,
- )
-
- def __del__(self):
- if self._custom_exe_path:
- # if the driver binary is specified by user
- # we assume it is important enough to not delete it
- return
- else:
- timeout = 3 # stop trying after this many seconds
- t = time.monotonic()
- now = lambda: time.monotonic()
- while now() - t > timeout:
- # we don't want to wait until the end of time
- try:
- if self.user_multi_procs:
- break
- os.unlink(self.executable_path)
- logger.debug("successfully unlinked %s" % self.executable_path)
- break
- except (OSError, RuntimeError, PermissionError):
- time.sleep(0.01)
- continue
- except FileNotFoundError:
- break
diff --git a/requirements.txt b/requirements.txt
index 0d8b373..aafe21d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,7 @@ python-dotenv
tls-client
undetected-chromedriver
selenium
-grpc_interceptor_headers
\ No newline at end of file
+grpc_interceptor_headers
+telethon
+schedule
+psycopg2-binary
\ No newline at end of file
diff --git a/runner.sh b/runner.sh
deleted file mode 100644
index 185249e..0000000
--- a/runner.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-TARGET_DIR=$(python -c "import undetected_chromedriver as uc; print(uc.__path__[0])" 2>/dev/null)
-
-if [ -d "$TARGET_DIR" ]; then
- echo "patcher.py в $TARGET_DIR"
- cp patcher.py "$TARGET_DIR"
-else
- echo "undetected_chromedriver не найден"
-fi
-
-pip install -r requirements.txt
-
-python initiator.py
\ No newline at end of file
diff --git a/tg/tg_crawler.py b/tg/tg_crawler.py
new file mode 100644
index 0000000..eefe1df
--- /dev/null
+++ b/tg/tg_crawler.py
@@ -0,0 +1,73 @@
+import logging
+import psycopg2
+from datetime import datetime
+from telethon.sync import TelegramClient
+from telethon.tl.functions.messages import GetHistoryRequest
+
+logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s - %(levelname)s - %(message)s',
+ handlers=[
+ logging.FileHandler('tg_nodes.log'),
+ logging.StreamHandler()
+ ]
+)
+
+class TelegramChannelMonitor:
+ db_config = None
+
+ def __init__(self, session_name, api_id, api_hash, channel_username, source_name):
+ self.session_name = session_name
+ self.api_id = api_id
+ self.api_hash = api_hash
+ self.channel_username = channel_username
+ self.source_name = source_name
+
+ @classmethod
+ def set_db_config(cls, config):
+ cls.db_config = config
+
+ def fetch_last_post(self):
+ logging.info(f"[{self.source_name}] checking a new post...")
+
+ try:
+ with TelegramClient(self.session_name, self.api_id, self.api_hash) as client:
+ entity = client.get_entity(self.channel_username)
+ history = client(GetHistoryRequest(
+ peer=entity,
+ limit=1,
+ offset_date=None,
+ offset_id=0,
+ max_id=0,
+ min_id=0,
+ add_offset=0,
+ hash=0
+ ))
+
+ if history.messages:
+ msg = history.messages[0]
+ logging.info(f"[{self.source_name}] received a post: {msg.message[:60]}...")
+ self.save_to_db(self.source_name, msg.message)
+ else:
+ logging.info(f"[{self.source_name}] there is no new messages")
+ except Exception as e:
+ logging.error(f"[{self.source_name}] error when receiving a post: {e}")
+
+ def save_to_db(self, source, message):
+ if not self.db_config:
+ logging.error("DB config is not set")
+ return
+
+ try:
+ conn = psycopg2.connect(**self.db_config)
+ cur = conn.cursor()
+ cur.execute(
+ "INSERT INTO leaks (source, message) VALUES (%s, %s)",
+ (source, message)
+ )
+ conn.commit()
+ cur.close()
+ conn.close()
+ logging.info(f"[{self.source_name}] message is recorded in the database")
+ except Exception as e:
+ logging.error(f"[{self.source_name}] error when writing to the database: {e}")
diff --git a/tg/tg_node_0.py b/tg/tg_node_0.py
new file mode 100644
index 0000000..7693788
--- /dev/null
+++ b/tg/tg_node_0.py
@@ -0,0 +1,34 @@
+import asyncio
+import os
+from apscheduler.schedulers.asyncio import AsyncIOScheduler
+from pytz import timezone
+from tg_crawler import TelegramChannelMonitor
+from dotenv import load_dotenv
+
+load_dotenv()
+
+TelegramChannelMonitor.set_db_config({
+ 'host': os.getenv("HOST"),
+ 'port': os.getenv("PORT"),
+ 'database': os.getenv("DBNAME"),
+ 'user': os.getenv("USER"),
+ 'password': os.getenv("PASSWORD")
+})
+
+monitor = TelegramChannelMonitor(
+ session_name='session_trueosint',
+ api_id=os.getenv("TELETHON_API_ID"),
+ api_hash=os.getenv("TELETHON_API_HASH"),
+ channel_username='trueosint',
+ source_name='trueosint'
+)
+
+def main():
+ scheduler = AsyncIOScheduler()
+ scheduler.add_job(monitor.fetch_last_post, "cron", hour=9, minute=0, timezone=timezone("Europe/Moscow"))
+ scheduler.start()
+
+ asyncio.get_event_loop().run_forever()
+
+if __name__ == '__main__':
+ main()
diff --git a/tg/tg_node_1.py b/tg/tg_node_1.py
new file mode 100644
index 0000000..8d8b8ec
--- /dev/null
+++ b/tg/tg_node_1.py
@@ -0,0 +1,34 @@
+import asyncio
+import os
+from apscheduler.schedulers.asyncio import AsyncIOScheduler
+from pytz import timezone
+from tg_crawler import TelegramChannelMonitor
+from dotenv import load_dotenv
+
+load_dotenv()
+
+TelegramChannelMonitor.set_db_config({
+ 'host': os.getenv("HOST"),
+ 'port': os.getenv("PORT"),
+ 'database': os.getenv("DBNAME"),
+ 'user': os.getenv("USER"),
+ 'password': os.getenv("PASSWORD")
+})
+
+monitor = TelegramChannelMonitor(
+ session_name='session_dataleak',
+ api_id=os.getenv("TELETHON_API_ID"),
+ api_hash=os.getenv("TELETHON_API_HASH"),
+ channel_username='dataleak',
+ source_name='dataleak'
+)
+
+def main():
+ scheduler = AsyncIOScheduler()
+ scheduler.add_job(monitor.fetch_last_post, "cron", hour=9, minute=0, timezone=timezone("Europe/Moscow"))
+ scheduler.start()
+
+ asyncio.get_event_loop().run_forever()
+
+if __name__ == '__main__':
+ main()
diff --git a/web/app.py b/web/app.py
new file mode 100644
index 0000000..f4f8cec
--- /dev/null
+++ b/web/app.py
@@ -0,0 +1,34 @@
+import os
+from flask import Flask, render_template
+import psycopg2
+
+app = Flask(__name__)
+
+DB_CONFIG = {
+ 'host': os.getenv("HOST"),
+ 'port': os.getenv("PORT"),
+ 'database': os.getenv("DBNAME"),
+ 'user': os.getenv("USER"),
+ 'password': os.getenv("PASSWORD")
+}
+
+@app.route("/")
+def index():
+ with psycopg2.connect(**DB_CONFIG) as conn:
+ with conn.cursor() as cur:
+ cur.execute("SELECT source, message, created_at FROM leaks ORDER BY created_at DESC LIMIT 50")
+ leaks = cur.fetchall()
+ return render_template("index.html", leaks=leaks)
+
+@app.route("/logs")
+def logs():
+ log_path = os.path.join(os.path.dirname(__file__), '..', 'tg_nodes.log')
+ try:
+ with open(log_path, 'r', encoding='utf-8') as f:
+ lines = f.readlines()[-100:]
+ except FileNotFoundError:
+ lines = ["Лог-файл не найден"]
+ return render_template("logs.html", logs=lines)
+
+if __name__ == "__main__":
+ app.run(debug=True)
diff --git a/web/templates/index.html b/web/templates/index.html
new file mode 100644
index 0000000..6fdcb7a
--- /dev/null
+++ b/web/templates/index.html
@@ -0,0 +1,368 @@
+
+
+
+
+
+ Система мониторинга утечек данных
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Система мониторинга утечек данных
+
+
+
+
+
+
+
+
Утечки за месяц
+
+
+
+
Состояние парсеров
+
+
+
+
Источники утечек
+
+
+
+
Скриншоты с форумов
+
+
+
+
Логи системы
+
+
[2023-10-01 12:34] Парсер форума запущен.
+
[2023-10-01 12:35] Найдена новая утечка на форуме X.
+
[2023-10-01 12:36] Ошибка парсера Telegram: timeout.
+
[2023-10-01 12:37] Утечка данных в Москве зафиксирована.
+
[2023-10-01 12:38] Парсер даркнета завершил работу.
+
[2023-10-01 12:39] Новый скриншот добавлен в базу.
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/web/templates/logs.html b/web/templates/logs.html
new file mode 100644
index 0000000..a9eb2eb
--- /dev/null
+++ b/web/templates/logs.html
@@ -0,0 +1,246 @@
+
+
+
+
+
+ Логи системы
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Логи системы
+
+
[2023-10-01 12:34] Парсер форума запущен.
+
[2023-10-01 12:35] Найдена новая утечка на форуме X.
+
[2023-10-01 12:36] Ошибка парсера Telegram: timeout.
+
[2023-10-01 12:37] Утечка данных в Москве зафиксирована.
+
[2023-10-01 12:38] Парсер даркнета завершил работу.
+
[2023-10-01 12:39] Новый скриншот добавлен в базу.
+
+
+
+
+
+
+
+
\ No newline at end of file