rm: dld.py, test
This commit is contained in:
parent
44ce626a1b
commit
9788b5aa4e
97
dld.py
97
dld.py
@ -1,97 +0,0 @@
|
|||||||
from bs4 import BeautifulSoup
|
|
||||||
import os
|
|
||||||
import psycopg2
|
|
||||||
from dotenv import load_dotenv
|
|
||||||
from celery import Celery
|
|
||||||
from utils.logg import LoggerSingleton
|
|
||||||
from driver.driver_creator import DriverCreator
|
|
||||||
from selenium.webdriver.common.by import By
|
|
||||||
from time import sleep
|
|
||||||
from celery_app import app
|
|
||||||
|
|
||||||
# Populate os.environ from a local .env file (DB_* credentials read later
# by ForumCrawler) before any configuration is consumed.
load_dotenv()

# Shared module-level logger from the project's LoggerSingleton helper.
logger = LoggerSingleton.get_logger()
|
|
||||||
|
|
||||||
class ForumCrawler:
    """Crawl a news/forum page for posts about data leaks and breaches.

    The crawler fetches the page with a Selenium WebDriver (built by the
    project's ``DriverCreator``), inspects the newest post, and logs it when
    the title mentions a leak. Found leaks can optionally be persisted to a
    PostgreSQL ``leaks`` table via psycopg2; connection parameters are taken
    from environment variables (loaded by ``load_dotenv()`` at import time).
    """

    def __init__(self, forum_url):
        """Store the target URL and prepare (but do not open) the DB connection.

        :param forum_url: URL of the page to crawl for leak-related posts.
        """
        self.forum_url = forum_url
        # Connection parameters come from the environment. Any variable that
        # is unset yields None here and surfaces as a connect-time error.
        # (Restored: _connect() dereferences self.db_config, so leaving this
        # out raised AttributeError on any persistence attempt.)
        self.db_config = {
            'host': os.getenv('DB_HOST'),
            'port': os.getenv('DB_PORT'),
            'database': os.getenv('DB_NAME'),
            'user': os.getenv('DB_USER'),
            'password': os.getenv('DB_PASSWORD'),
        }
        # Opened lazily by _save_to_db() -> _connect(); crawling without
        # persistence therefore never touches the database.
        self.conn = None

    def _connect(self):
        """Open a PostgreSQL connection from ``self.db_config``.

        :raises Exception: re-raises any psycopg2 connection error after logging.
        """
        try:
            self.conn = psycopg2.connect(**self.db_config)
            logger.info("Database connection established")
        except Exception as e:
            logger.error(f"Database connection error: {e}")
            raise

    def _close(self):
        """Close the DB connection if open; log (but swallow) close errors."""
        if self.conn and not self.conn.closed:
            try:
                self.conn.close()
                logger.info("Database connection closed")
            except Exception as e:
                # Best-effort cleanup: a failed close must not mask the
                # caller's original error path.
                logger.error(f"Error closing database connection: {e}")

    def _crawl_leaks(self):
        """Fetch the forum page and log the newest post if it mentions a leak.

        Always quits the WebDriver (``finally``); crawl errors are logged and
        re-raised so callers/schedulers can see the failure.
        """
        driver_creator = DriverCreator([])
        driver = driver_creator.get_driver()

        try:
            logger.info(f"Starting forum crawl: {self.forum_url}")
            driver.get(self.forum_url)
            # Crude fixed wait for the page to render.
            # TODO: replace with an explicit WebDriverWait on the selector.
            sleep(5)

            posts = driver.find_elements(By.CSS_SELECTOR, 'a.story-link h2.home-title')
            if not posts:
                logger.info("No posts found on the page")
                return
            last_post = posts[0]
            title = last_post.text.strip()
            # NOTE(review): href is read from the <h2>, not the enclosing
            # <a.story-link>; this likely returns None — verify against the
            # actual page markup.
            link = last_post.get_attribute('href')
            post_content = f"{title} - {link}"

            if 'data breach' in title.lower() or 'leak' in title.lower():
                logger.info(post_content)
                # self._save_to_db('Hacker News', post_content)
                logger.info(f"New leak found: {title} - {link}")
            else:
                logger.info("Last post is not about leaks")

        except Exception as e:
            logger.error(f"Error during forum crawling: {e}")
            raise
        finally:
            driver.quit()
            logger.info("WebDriver session closed")

    def _save_to_db(self, source, message):
        """Insert one leak record into the ``leaks`` table.

        Connects lazily if the connection is missing/closed. On write failure
        the transaction is rolled back and the error is logged (not re-raised),
        matching the original best-effort persistence behavior.

        :param source: human-readable resource name (e.g. 'Hacker News').
        :param message: leak description to store.
        """
        if not self.conn or self.conn.closed:
            self._connect()
        try:
            with self.conn.cursor() as cur:
                cur.execute(
                    "INSERT INTO leaks (resource_name, message) VALUES (%s, %s)",
                    (source, message)
                )
            self.conn.commit()
            logger.info(f"Leak from {source} saved in the database")
        except Exception as e:
            # Roll back so the connection is not left in an aborted
            # transaction, which would poison every subsequent insert.
            try:
                self.conn.rollback()
            except Exception:
                pass  # connection may be dead; original behavior is best-effort
            logger.error(f"Error writing to the database: {e}")
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # One-shot crawl of The Hacker News "data breach" label page; the DB
    # connection (if one was ever opened) is always released on exit.
    target_url = 'https://thehackernews.com/search/label/data%20breach'
    forum_crawler = ForumCrawler(target_url)
    try:
        forum_crawler._crawl_leaks()
    finally:
        forum_crawler._close()
|
|
Loading…
Reference in New Issue
Block a user