rm: dld.py, test
This commit is contained in:
		
							parent
							
								
									44ce626a1b
								
							
						
					
					
						commit
						9788b5aa4e
					
				
							
								
								
									
										97
									
								
								dld.py
									
									
									
									
									
								
							
							
						
						
									
										97
									
								
								dld.py
									
									
									
									
									
								
							| @ -1,97 +0,0 @@ | ||||
| from bs4 import BeautifulSoup | ||||
| import os | ||||
| import psycopg2 | ||||
| from dotenv import load_dotenv | ||||
| from celery import Celery | ||||
| from utils.logg import LoggerSingleton | ||||
| from driver.driver_creator import DriverCreator | ||||
| from selenium.webdriver.common.by import By | ||||
| from time import sleep | ||||
| from celery_app import app | ||||
| 
 | ||||
| load_dotenv() | ||||
| 
 | ||||
| logger = LoggerSingleton.get_logger() | ||||
| 
 | ||||
| class ForumCrawler: | ||||
|     def __init__(self, forum_url): | ||||
|         self.forum_url = forum_url | ||||
|         # self.proxy_list = proxy_list | ||||
|         # self.db_config = { | ||||
|         #     'host': os.getenv('DB_HOST'), | ||||
|         #     'port': os.getenv('DB_PORT'), | ||||
|         #     'database': os.getenv('DB_NAME'), | ||||
|         #     'user': os.getenv('DB_USER'), | ||||
|         #     'password': os.getenv('DB_PASSWORD') | ||||
|         # } | ||||
|         self.conn = None | ||||
|         # self._connect() | ||||
| 
 | ||||
|     def _connect(self): | ||||
|         try: | ||||
|             self.conn = psycopg2.connect(**self.db_config) | ||||
|             logger.info("Database connection established") | ||||
|         except Exception as e: | ||||
|             logger.error(f"Database connection error: {e}") | ||||
|             raise | ||||
| 
 | ||||
|     def _close(self): | ||||
|         if self.conn and not self.conn.closed: | ||||
|             try: | ||||
|                 self.conn.close() | ||||
|                 logger.info("Database connection closed") | ||||
|             except Exception as e: | ||||
|                 logger.error(f"Error closing database connection: {e}") | ||||
| 
 | ||||
|     def _crawl_leaks(self): | ||||
|         driver_creator = DriverCreator([]) | ||||
|         driver = driver_creator.get_driver() | ||||
| 
 | ||||
|         try: | ||||
|             logger.info(f"Starting forum crawl: {self.forum_url}") | ||||
|             driver.get(self.forum_url) | ||||
|             sleep(5) | ||||
| 
 | ||||
|             posts = driver.find_elements(By.CSS_SELECTOR, 'a.story-link h2.home-title') | ||||
|             if not posts: | ||||
|                 logger.info("No posts found on the page") | ||||
|                 return | ||||
|             last_post = posts[0] | ||||
|             title = last_post.text.strip() | ||||
|             link = last_post.get_attribute('href') | ||||
|             post_content = f"{title} - {link}" | ||||
| 
 | ||||
|             if 'data breach' in title.lower() or 'leak' in title.lower(): | ||||
|                 logger.info(post_content) | ||||
|                 # self._save_to_db('Hacker News', post_content) | ||||
|                 logger.info(f"New leak found: {title} - {link}") | ||||
|             else: | ||||
|                 logger.info("Last post is not about leaks") | ||||
| 
 | ||||
|         except Exception as e: | ||||
|             logger.error(f"Error during forum crawling: {e}") | ||||
|             raise | ||||
|         finally: | ||||
|             driver.quit() | ||||
|             logger.info("WebDriver session closed") | ||||
|          | ||||
|     def _save_to_db(self, source, message): | ||||
|         if not self.conn or self.conn.closed: | ||||
|             self._connect() | ||||
|         try: | ||||
|             with self.conn.cursor() as cur: | ||||
|                 cur.execute( | ||||
|                     "INSERT INTO leaks (resource_name, message) VALUES (%s, %s)", | ||||
|                     (source, message) | ||||
|                 ) | ||||
|             self.conn.commit() | ||||
|             logger.info(f"Leak from {source} saved in the database") | ||||
|         except Exception as e: | ||||
|             logger.error(f"Error writing to the database: {e}") | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
|     crawler = ForumCrawler('https://thehackernews.com/search/label/data%20breach') | ||||
|     try: | ||||
|         crawler._crawl_leaks() | ||||
|     finally: | ||||
|         crawler._close() | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user