Initial commit v3

Xargana 2025-07-13 21:49:22 +03:00
parent 8edda894db
commit a9bcce85d6
5 changed files with 506 additions and 192 deletions

24
main.py
View file

@@ -9,10 +9,26 @@ import logging
 import sys
 from pathlib import Path
 
-from src.client import DiscordDataClient
-from src.config import Config
-from src.database import JSONDatabase
-from src.logger import setup_logger
+# Check if we're in the right directory
+if not Path("src").exists():
+    print("❌ Error: 'src' directory not found. Please run from the project root directory.")
+    sys.exit(1)
+
+# Add src to Python path
+sys.path.insert(0, str(Path(__file__).parent))
+
+try:
+    from src.client import DiscordDataClient
+    from src.config import Config
+    from src.database import JSONDatabase
+    from src.logger import setup_logger
+except ImportError as e:
+    print(f"❌ Import error: {e}")
+    print("\n🔧 To fix this, try:")
+    print("1. Run: python setup.py")
+    print("2. Or run: python test_imports.py")
+    print("3. Or install dependencies: pip install discord.py-self python-dotenv toml colorlog")
+    sys.exit(1)
 
 async def main():
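Note: the body of main() falls outside this hunk. A minimal sketch of how the imported modules might be wired together follows; Config(), config.token, config.database_path, and the setup_logger(config) signature are assumptions not shown in this commit, while DiscordDataClient(config, database) and JSONDatabase(path) match the constructors visible in the diffs further down.

# Hypothetical wiring only — names marked below as assumed do not appear in this commit.
import asyncio

async def main():
    config = Config()                              # assumed to read config.toml and .env
    setup_logger(config)                           # assumed signature
    database = JSONDatabase(config.database_path)  # JSONDatabase(path) per the database module below
    client = DiscordDataClient(config, database)   # constructor shown in the client diff
    try:
        await client.start(config.token)           # discord.py coroutine entry point
    finally:
        await client.close()

if __name__ == "__main__":
    asyncio.run(main())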

View file

@@ -7,14 +7,14 @@ discord.py-self>=2.0.0
 python-dotenv>=1.0.0
 toml>=0.10.2
 
-# Database (for future MongoDB integration)
+# For future MongoDB integration
 pymongo>=4.0.0
 
-# Async utilities
-asyncio-throttle>=1.0.0
-
-# Data processing
-pandas>=1.5.0
-
 # Logging
 colorlog>=6.0.0
+
+# Standard library backports (if needed)
+typing-extensions>=4.0.0
+
+# Optional: For better async performance
+uvloop>=0.17.0; sys_platform != "win32"

165
setup.py Normal file
View file

@@ -0,0 +1,165 @@
#!/usr/bin/env python3
"""
Setup script for Discord Data Collector
"""
import os
import sys
import subprocess
from pathlib import Path


def check_python_version():
    """Check if Python version is compatible."""
    if sys.version_info < (3, 8):
        print("❌ Python 3.8 or higher is required")
        sys.exit(1)
    print(f"✅ Python {sys.version_info.major}.{sys.version_info.minor} detected")


def install_dependencies():
    """Install required dependencies."""
    print("📦 Installing dependencies...")

    dependencies = [
        "discord.py-self>=2.0.0",
        "python-dotenv>=1.0.0",
        "toml>=0.10.2",
        "colorlog>=6.0.0"
    ]

    for dep in dependencies:
        try:
            print(f"Installing {dep}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", dep])
        except subprocess.CalledProcessError as e:
            print(f"❌ Failed to install {dep}: {e}")
            return False

    print("✅ All dependencies installed successfully")
    return True


def create_directories():
    """Create necessary directories."""
    directories = [
        "data",
        "data/backups",
        "logs",
        "src"
    ]

    for directory in directories:
        Path(directory).mkdir(parents=True, exist_ok=True)

    print("✅ Directories created")


def create_config_files():
    """Create configuration files if they don't exist."""
    # Create .env file
    env_file = Path(".env")
    if not env_file.exists():
        env_content = """# Discord Data Collector Environment Variables
# Add your Discord user token here
DISCORD_TOKEN=your_discord_user_token_here
"""
        with open(env_file, "w") as f:
            f.write(env_content)
        print("✅ Created .env file")

    # Create config.toml file
    config_file = Path("config.toml")
    if not config_file.exists():
        config_content = """# Discord Data Collector Configuration

[database]
path = "data/users.json"
backup_interval = 3600

[collection]
profile_pictures = true
bio = true
status = true
server_membership = true

[rate_limiting]
request_delay = 1.0
max_requests_per_minute = 30

[monitoring]
target_servers = []
monitor_all_servers = true

[logging]
level = "INFO"
file = "logs/collector.log"
"""
        with open(config_file, "w") as f:
            f.write(config_content)
        print("✅ Created config.toml file")


def test_imports():
    """Test if all imports work correctly."""
    print("🧪 Testing imports...")

    try:
        import discord
        print("✅ discord.py-self imported successfully")
    except ImportError as e:
        print(f"❌ Failed to import discord.py-self: {e}")
        return False

    try:
        import toml
        print("✅ toml imported successfully")
    except ImportError as e:
        print(f"❌ Failed to import toml: {e}")
        return False

    try:
        from dotenv import load_dotenv
        print("✅ python-dotenv imported successfully")
    except ImportError as e:
        print(f"❌ Failed to import python-dotenv: {e}")
        return False

    return True


def main():
    """Main setup function."""
    print("🚀 Discord Data Collector Setup")
    print("=" * 40)

    # Check Python version
    check_python_version()

    # Create directories
    create_directories()

    # Install dependencies
    if not install_dependencies():
        print("❌ Setup failed during dependency installation")
        sys.exit(1)

    # Test imports
    if not test_imports():
        print("❌ Setup failed during import testing")
        sys.exit(1)

    # Create config files
    create_config_files()

    print("\n✅ Setup completed successfully!")
    print("\n📝 Next steps:")
    print("1. Edit .env file and add your Discord token")
    print("2. Optionally modify config.toml settings")
    print("3. Run: python main.py")
    print("\n⚠️ Remember: This tool is for educational/research purposes only")


if __name__ == "__main__":
    main()
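The config.toml written above is consumed by src/config.py, which is not part of this commit. A minimal sketch of what the rest of the diff appears to expect from it; the attribute and key names are inferred from usage in setup.py and the client code, so treat this as an assumption rather than the actual module.

# Hypothetical sketch of src/config.py — not included in this commit.
# Attribute names are inferred from how DiscordDataClient and setup.py use them.
import os
import toml
from dotenv import load_dotenv

class Config:
    def __init__(self, path: str = "config.toml"):
        load_dotenv()                          # pulls DISCORD_TOKEN from .env
        data = toml.load(path)

        self.token = os.getenv("DISCORD_TOKEN")
        self.database_path = data["database"]["path"]
        self.request_delay = data["rate_limiting"]["request_delay"]
        self.max_requests_per_minute = data["rate_limiting"]["max_requests_per_minute"]
        self.monitor_all_servers = data["monitoring"]["monitor_all_servers"]
        self._target_servers = data["monitoring"]["target_servers"]
        self.collect_bio = data["collection"]["bio"]

    def get_target_servers(self):
        return list(self._target_servers)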

View file

@@ -1,205 +1,244 @@
-"""
-JSON database manager for Discord user data storage.
-"""
-import json
-import asyncio
-import shutil
-from datetime import datetime
-from pathlib import Path
-from typing import Dict, List, Optional, Any
-from dataclasses import dataclass, asdict
-import logging
-
-
-@dataclass
-class UserData:
-    """Data structure for storing user information."""
-    user_id: int
-    username: str
-    discriminator: str
-    display_name: Optional[str] = None
-    avatar_url: Optional[str] = None
-    banner_url: Optional[str] = None
-    bio: Optional[str] = None
-    status: Optional[str] = None
-    activity: Optional[str] = None
-    servers: List[int] = None
-    created_at: str = None
-    updated_at: str = None
-
-    def __post_init__(self):
-        if self.servers is None:
-            self.servers = []
-
-        current_time = datetime.utcnow().isoformat()
-        if self.created_at is None:
-            self.created_at = current_time
-        self.updated_at = current_time
-
-
-class JSONDatabase:
-    """JSON-based database for storing Discord user data."""
-
-    def __init__(self, database_path: str):
-        """Initialize the JSON database."""
-        self.database_path = Path(database_path)
-        self.backup_path = Path("data/backups")
-        self.logger = logging.getLogger(__name__)
-        self._lock = asyncio.Lock()
-        self._data: Dict[str, Dict] = {}
-
-        # Ensure database directory exists
-        self.database_path.parent.mkdir(parents=True, exist_ok=True)
-        self.backup_path.mkdir(parents=True, exist_ok=True)
-
-        # Load existing data
-        self._load_data()
-
-    def _load_data(self):
-        """Load data from JSON file."""
-        if self.database_path.exists():
-            try:
-                with open(self.database_path, 'r', encoding='utf-8') as f:
-                    self._data = json.load(f)
-                self.logger.info(f"Loaded {len(self._data)} users from database")
-            except Exception as e:
-                self.logger.error(f"Error loading database: {e}")
-                self._data = {}
-        else:
-            self._data = {}
-            self.logger.info("Created new database")
-
-    async def _save_data(self):
-        """Save data to JSON file."""
-        async with self._lock:
-            try:
-                # Create backup before saving
-                if self.database_path.exists():
-                    backup_filename = f"users_backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
-                    backup_path = self.backup_path / backup_filename
-                    shutil.copy2(self.database_path, backup_path)
-
-                # Save data
-                with open(self.database_path, 'w', encoding='utf-8') as f:
-                    json.dump(self._data, f, indent=2, ensure_ascii=False)
-
-                self.logger.debug(f"Saved {len(self._data)} users to database")
-            except Exception as e:
-                self.logger.error(f"Error saving database: {e}")
-
-    async def get_user(self, user_id: int) -> Optional[UserData]:
-        """Get user data by ID."""
-        user_key = str(user_id)
-        if user_key in self._data:
-            user_dict = self._data[user_key]
-            return UserData(**user_dict)
-        return None
-
-    async def save_user(self, user_data: UserData):
-        """Save or update user data."""
-        user_key = str(user_data.user_id)
-
-        # If user exists, preserve created_at timestamp
-        if user_key in self._data:
-            user_data.created_at = self._data[user_key]['created_at']
-
-        # Update timestamp
-        user_data.updated_at = datetime.utcnow().isoformat()
-
-        # Save to memory
-        self._data[user_key] = asdict(user_data)
-
-        # Save to disk
-        await self._save_data()
-
-        self.logger.debug(f"Saved user {user_data.username}#{user_data.discriminator} ({user_data.user_id})")
-
-    async def add_server_to_user(self, user_id: int, server_id: int):
-        """Add a server to user's server list."""
-        user_key = str(user_id)
-        if user_key in self._data:
-            if server_id not in self._data[user_key]['servers']:
-                self._data[user_key]['servers'].append(server_id)
-                self._data[user_key]['updated_at'] = datetime.utcnow().isoformat()
-                await self._save_data()
-
-    async def get_all_users(self) -> List[UserData]:
-        """Get all users from the database."""
-        return [UserData(**user_dict) for user_dict in self._data.values()]
-
-    async def get_users_by_server(self, server_id: int) -> List[UserData]:
-        """Get all users that are members of a specific server."""
-        users = []
-        for user_dict in self._data.values():
-            if server_id in user_dict.get('servers', []):
-                users.append(UserData(**user_dict))
-        return users
-
-    async def get_user_count(self) -> int:
-        """Get total number of users in database."""
-        return len(self._data)
-
-    async def get_server_count(self) -> int:
-        """Get total number of unique servers."""
-        servers = set()
-        for user_dict in self._data.values():
-            servers.update(user_dict.get('servers', []))
-        return len(servers)
-
-    async def cleanup_old_backups(self, max_backups: int = 10):
-        """Clean up old backup files, keeping only the most recent ones."""
-        backup_files = sorted(self.backup_path.glob("users_backup_*.json"))
-
-        if len(backup_files) > max_backups:
-            files_to_remove = backup_files[:-max_backups]
-            for file_path in files_to_remove:
-                try:
-                    file_path.unlink()
-                    self.logger.info(f"Removed old backup: {file_path.name}")
-                except Exception as e:
-                    self.logger.error(f"Error removing backup {file_path.name}: {e}")
-
-    async def export_to_csv(self, output_path: str):
-        """Export user data to CSV format."""
-        import csv
-
-        output_path = Path(output_path)
-
-        try:
-            with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
-                fieldnames = ['user_id', 'username', 'discriminator', 'display_name',
-                              'avatar_url', 'bio', 'status', 'servers', 'created_at', 'updated_at']
-                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
-
-                writer.writeheader()
-                for user_dict in self._data.values():
-                    # Convert servers list to string
-                    user_dict_copy = user_dict.copy()
-                    user_dict_copy['servers'] = ','.join(map(str, user_dict.get('servers', [])))
-                    writer.writerow(user_dict_copy)
-
-            self.logger.info(f"Exported {len(self._data)} users to {output_path}")
-        except Exception as e:
-            self.logger.error(f"Error exporting to CSV: {e}")
-
-    async def get_statistics(self) -> Dict[str, Any]:
-        """Get database statistics."""
-        stats = {
-            'total_users': await self.get_user_count(),
-            'total_servers': await self.get_server_count(),
-            'database_size': self.database_path.stat().st_size if self.database_path.exists() else 0
-        }
-
-        # Most active servers
-        server_counts = {}
-        for user_dict in self._data.values():
-            for server_id in user_dict.get('servers', []):
-                server_counts[server_id] = server_counts.get(server_id, 0) + 1
-
-        stats['most_active_servers'] = sorted(server_counts.items(),
-                                              key=lambda x: x[1], reverse=True)[:10]
-
-        return stats
+"""
+Discord client implementation for data collection.
+"""
+import asyncio
+import logging
+from datetime import datetime
+from typing import Optional, Set
+
+try:
+    import discord
+    from discord.ext import tasks
+except ImportError:
+    raise ImportError("discord.py-self is required. Install with: pip install discord.py-self")
+
+from .config import Config
+from .database import JSONDatabase, UserData
+from .rate_limiter import RateLimiter
+
+
+class DiscordDataClient(discord.Client):
+    """Custom Discord client for collecting user data."""
+
+    def __init__(self, config: Config, database: JSONDatabase):
+        super().__init__()
+        self.config = config
+        self.database = database
+        self.rate_limiter = RateLimiter(
+            requests_per_minute=config.max_requests_per_minute,
+            delay_between_requests=config.request_delay
+        )
+        self.logger = logging.getLogger(__name__)
+        self.processed_users: Set[int] = set()
+        self.target_servers = set(config.get_target_servers())
+
+        # Start background tasks
+        self.cleanup_task.start()
+        self.stats_task.start()
+
+    async def on_ready(self):
+        """Called when the client is ready."""
+        self.logger.info(f"Logged in as {self.user} (ID: {self.user.id})")
+        self.logger.info(f"Connected to {len(self.guilds)} servers")
+
+        # Initial scan of server members
+        await self._scan_all_servers()
+
+    async def on_message(self, message):
+        """Handle incoming messages."""
+        # Skip messages from bots
+        if message.author.bot:
+            return
+
+        # Check if we should monitor this server
+        if not self._should_monitor_server(message.guild.id):
+            return
+
+        # Process the message author
+        await self._process_user(message.author, message.guild.id)
+
+    async def on_member_join(self, member):
+        """Handle member join events."""
+        if not self._should_monitor_server(member.guild.id):
+            return
+
+        await self._process_user(member, member.guild.id)
+
+    async def on_member_update(self, before, after):
+        """Handle member update events."""
+        if not self._should_monitor_server(after.guild.id):
+            return
+
+        # Only process if relevant data changed
+        if (before.display_name != after.display_name or
+                before.avatar != after.avatar or
+                before.status != after.status):
+            await self._process_user(after, after.guild.id)
+
+    async def on_user_update(self, before, after):
+        """Handle user update events."""
+        # Process user if they're in any monitored servers
+        for guild in self.guilds:
+            if self._should_monitor_server(guild.id):
+                member = guild.get_member(after.id)
+                if member:
+                    await self._process_user(member, guild.id)
+                    break
+
+    def _should_monitor_server(self, server_id: int) -> bool:
+        """Check if we should monitor this server."""
+        if self.config.monitor_all_servers:
+            return True
+        return server_id in self.target_servers
+
+    async def _scan_all_servers(self):
+        """Scan all server members initially."""
+        self.logger.info("Starting initial server scan...")
+
+        for guild in self.guilds:
+            if not self._should_monitor_server(guild.id):
+                continue
+
+            self.logger.info(f"Scanning server: {guild.name} ({guild.id})")
+
+            try:
+                # Get all members
+                members = [member async for member in guild.fetch_members(limit=None)]
+
+                for member in members:
+                    if not member.bot:
+                        await self._process_user(member, guild.id)
+
+                        # Rate limiting
+                        await self.rate_limiter.wait()
+
+                self.logger.info(f"Processed {len(members)} members from {guild.name}")
+
+            except Exception as e:
+                self.logger.error(f"Error scanning server {guild.name}: {e}")
+
+        self.logger.info("Initial server scan completed")
+
+    async def _process_user(self, user, server_id: int):
+        """Process a user and save their data."""
+        try:
+            # Check if we've already processed this user recently
+            if user.id in self.processed_users:
+                # Just add server to existing user
+                await self.database.add_server_to_user(user.id, server_id)
+                return
+
+            # Rate limiting
+            await self.rate_limiter.wait()
+
+            # Get existing user data
+            existing_user = await self.database.get_user(user.id)
+
+            # Create user data
+            user_data = UserData(
+                user_id=user.id,
+                username=user.name,
+                discriminator=user.discriminator,
+                display_name=getattr(user, 'display_name', None),
+                avatar_url=str(user.avatar.url) if user.avatar else None,
+                banner_url=str(user.banner.url) if hasattr(user, 'banner') and user.banner else None,
+                bio=await self._get_user_bio(user),
+                status=str(user.status) if hasattr(user, 'status') else None,
+                activity=str(user.activity) if hasattr(user, 'activity') and user.activity else None,
+                servers=[server_id] if existing_user is None else existing_user.servers,
+                created_at=existing_user.created_at if existing_user else None
+            )
+
+            # Add server to list if not already there
+            if server_id not in user_data.servers:
+                user_data.servers.append(server_id)
+
+            # Save user data
+            await self.database.save_user(user_data)
+
+            # Mark as processed
+            self.processed_users.add(user.id)
+
+            self.logger.debug(f"Processed user: {user.name}#{user.discriminator}")
+
+        except Exception as e:
+            self.logger.error(f"Error processing user {user.name}: {e}")
+
+    async def _get_user_bio(self, user) -> Optional[str]:
+        """Get user bio/about me section."""
+        if not self.config.collect_bio:
+            return None
+
+        try:
+            # Try to get user profile
+            if hasattr(user, 'id'):
+                profile = await self.fetch_user(user.id)
+                return getattr(profile, 'bio', None)
+        except Exception as e:
+            self.logger.debug(f"Could not fetch bio for user {user.name}: {e}")
+
+        return None
+
+    @tasks.loop(hours=1)
+    async def cleanup_task(self):
+        """Periodic cleanup task."""
+        try:
+            # Clean up old backups
+            await self.database.cleanup_old_backups()
+
+            # Clear processed users set to allow re-processing
+            self.processed_users.clear()
+
+            self.logger.info("Cleanup task completed")
+        except Exception as e:
+            self.logger.error(f"Error in cleanup task: {e}")
+
+    @tasks.loop(minutes=30)
+    async def stats_task(self):
+        """Periodic statistics logging."""
+        try:
+            stats = await self.database.get_statistics()
+            self.logger.info(f"Database stats: {stats['total_users']} users, "
+                             f"{stats['total_servers']} servers, "
+                             f"{stats['database_size']} bytes")
+        except Exception as e:
+            self.logger.error(f"Error in stats task: {e}")
+
+    async def export_data(self, format_type: str = "csv", output_path: str = None):
+        """Export collected data."""
+        if output_path is None:
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            output_path = f"data/export_{timestamp}.{format_type}"
+
+        if format_type == "csv":
+            await self.database.export_to_csv(output_path)
+        else:
+            raise ValueError(f"Unsupported export format: {format_type}")
+
+        self.logger.info(f"Data exported to {output_path}")
+
+    async def get_user_info(self, user_id: int) -> Optional[UserData]:
+        """Get information about a specific user."""
+        return await self.database.get_user(user_id)
+
+    async def get_server_users(self, server_id: int) -> list:
+        """Get all users from a specific server."""
+        return await self.database.get_users_by_server(server_id)
+
+    async def close(self):
+        """Clean shutdown."""
+        # Cancel background tasks
+        self.cleanup_task.cancel()
+        self.stats_task.cancel()
+
+        # Close parent client
+        await super().close()
+
+        self.logger.info("Discord client closed")
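src/rate_limiter.py is imported by the client above but is not included in this commit. A plausible sketch that matches the RateLimiter(requests_per_minute=..., delay_between_requests=...) constructor and the awaited wait() call; the sliding-window implementation itself is an assumption (test_imports.py hints at collections.deque and time being used).

# Hypothetical sketch of src/rate_limiter.py — not part of this commit.
# Only the constructor keywords and the awaitable wait() come from the client code above.
import asyncio
import time
from collections import deque

class RateLimiter:
    def __init__(self, requests_per_minute: int = 30, delay_between_requests: float = 1.0):
        self.requests_per_minute = requests_per_minute
        self.delay_between_requests = delay_between_requests
        self._timestamps = deque()

    async def wait(self):
        """Sleep long enough to respect both the per-request delay and the per-minute cap."""
        now = time.monotonic()

        # Drop timestamps older than the 60-second window
        while self._timestamps and now - self._timestamps[0] > 60:
            self._timestamps.popleft()

        # If the window is full, sleep until the oldest request ages out
        if len(self._timestamps) >= self.requests_per_minute:
            await asyncio.sleep(60 - (now - self._timestamps[0]))

        # Fixed delay between consecutive requests
        await asyncio.sleep(self.delay_between_requests)
        self._timestamps.append(time.monotonic())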

94
test_imports.py Normal file
View file

@@ -0,0 +1,94 @@
#!/usr/bin/env python3
"""
Test script to verify all imports work correctly
"""
import sys
import traceback


def test_import(module_name, import_statement):
    """Test a specific import."""
    try:
        exec(import_statement)
        print(f"{module_name}: OK")
        return True
    except ImportError as e:
        print(f"{module_name}: {e}")
        return False
    except Exception as e:
        print(f"{module_name}: Unexpected error - {e}")
        return False


def main():
    """Test all required imports."""
    print("🧪 Testing Discord Data Collector Imports")
    print("=" * 50)

    tests = [
        ("discord.py-self", "import discord"),
        ("toml", "import toml"),
        ("python-dotenv", "from dotenv import load_dotenv"),
        ("pathlib", "from pathlib import Path"),
        ("asyncio", "import asyncio"),
        ("logging", "import logging"),
        ("datetime", "from datetime import datetime"),
        ("json", "import json"),
        ("dataclasses", "from dataclasses import dataclass, asdict"),
        ("collections", "from collections import deque"),
        ("time", "import time"),
        ("typing", "from typing import Optional, Set, Dict, List, Any"),
    ]

    failed = 0
    for module_name, import_statement in tests:
        if not test_import(module_name, import_statement):
            failed += 1

    print(f"\n📊 Results: {len(tests) - failed}/{len(tests)} imports successful")

    if failed == 0:
        print("✅ All imports successful! Testing local modules...")

        # Test local modules
        try:
            # Add current directory to path
            sys.path.insert(0, '.')

            # Test config
            from src.config import Config
            print("✅ src.config: OK")

            # Test database
            from src.database import JSONDatabase, UserData
            print("✅ src.database: OK")

            # Test rate limiter
            from src.rate_limiter import RateLimiter
            print("✅ src.rate_limiter: OK")

            # Test logger
            from src.logger import setup_logger
            print("✅ src.logger: OK")

            # Test client
            from src.client import DiscordDataClient
            print("✅ src.client: OK")

            print("\n🎉 All tests passed! The application should work correctly.")

        except Exception as e:
            print(f"❌ Local module test failed: {e}")
            print("\nDetailed error:")
            traceback.print_exc()
            return False
    else:
        print(f"\n{failed} import(s) failed. Please install missing dependencies:")
        print("pip install discord.py-self python-dotenv toml colorlog")
        return False

    return True


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)