From 8edda894db5c34edada33cc3b1356665649b691b Mon Sep 17 00:00:00 2001 From: Xargana Date: Sun, 13 Jul 2025 21:04:53 +0300 Subject: [PATCH] Initial commit v2 --- .env.example | 13 +++ cli.py | 182 ++++++++++++++++++++++++++++++++++++++ config.toml | 33 +++++++ main.py | 50 +++++++++++ readme.md | 208 ++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 20 +++++ src/__init__.py | 43 +++++++++ src/client.py | 205 +++++++++++++++++++++++++++++++++++++++++++ src/config.py | 121 ++++++++++++++++++++++++++ src/database.py | 205 +++++++++++++++++++++++++++++++++++++++++++ src/logger.py | 61 +++++++++++++ src/rate_limiter.py | 70 +++++++++++++++ 12 files changed, 1211 insertions(+) create mode 100644 .env.example create mode 100644 cli.py create mode 100644 config.toml create mode 100644 main.py create mode 100644 readme.md create mode 100644 requirements.txt create mode 100644 src/__init__.py create mode 100644 src/client.py create mode 100644 src/config.py create mode 100644 src/database.py create mode 100644 src/logger.py create mode 100644 src/rate_limiter.py diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..4d10986 --- /dev/null +++ b/.env.example @@ -0,0 +1,13 @@ +# Discord Data Collector Environment Variables +# Copy this file to .env and fill in your values + +# Discord user token (REQUIRED) +# WARNING: This should be your user token, not a bot token +# Keep this secret and never share it publicly +DISCORD_TOKEN=your_discord_user_token_here + +# Optional: Database connection string for future MongoDB integration +# MONGODB_URI=mongodb://localhost:27017/discord_research + +# Optional: Additional API keys for extended functionality +# BACKUP_WEBHOOK_URL=https://discord.com/api/webhooks/your_webhook_url \ No newline at end of file diff --git a/cli.py b/cli.py new file mode 100644 index 0000000..1b1bf46 --- /dev/null +++ b/cli.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python3 +""" +Command-line interface for Discord Data Collector. 
+""" + +import argparse +import asyncio +import json +import sys +from pathlib import Path + +# Add src to path +sys.path.append(str(Path(__file__).parent)) + +from src.config import Config +from src.database import JSONDatabase +from src.client import DiscordDataClient + + +async def export_data(format_type: str, output_path: str = None): + """Export collected data.""" + config = Config() + database = JSONDatabase(config.database_path) + + if output_path is None: + from datetime import datetime + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + output_path = f"data/export_{timestamp}.{format_type}" + + if format_type == "csv": + await database.export_to_csv(output_path) + print(f"Data exported to {output_path}") + else: + print(f"Unsupported format: {format_type}") + + +async def show_stats(): + """Show database statistics.""" + config = Config() + database = JSONDatabase(config.database_path) + + stats = await database.get_statistics() + + print("\n=== Database Statistics ===") + print(f"Total users: {stats['total_users']}") + print(f"Total servers: {stats['total_servers']}") + print(f"Database size: {stats['database_size']} bytes") + + if stats['most_active_servers']: + print("\nMost active servers:") + for server_id, user_count in stats['most_active_servers'][:5]: + print(f" Server {server_id}: {user_count} users") + + +async def search_user(query: str): + """Search for users.""" + config = Config() + database = JSONDatabase(config.database_path) + + all_users = await database.get_all_users() + + # Search by username or user ID + results = [] + for user in all_users: + if (query.lower() in user.username.lower() or + query.lower() in (user.display_name or "").lower() or + query == str(user.user_id)): + results.append(user) + + if not results: + print("No users found matching the query.") + return + + print(f"\n=== Found {len(results)} users ===") + for user in results[:10]: # Show first 10 results + print(f"{user.username}#{user.discriminator} (ID: 
{user.user_id})") + if user.display_name: + print(f" Display name: {user.display_name}") + if user.bio: + print(f" Bio: {user.bio[:100]}...") + print(f" Servers: {len(user.servers)}") + print(f" Last updated: {user.updated_at}") + print() + + +async def backup_database(): + """Create a manual backup of the database.""" + config = Config() + database = JSONDatabase(config.database_path) + + from datetime import datetime + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + backup_path = f"data/backups/manual_backup_{timestamp}.json" + + # Copy current database + import shutil + shutil.copy2(database.database_path, backup_path) + + print(f"Database backed up to {backup_path}") + + +async def cleanup_data(): + """Clean up old data and backups.""" + config = Config() + database = JSONDatabase(config.database_path) + + await database.cleanup_old_backups(max_backups=5) + print("Cleanup completed") + + +async def test_connection(): + """Test Discord connection.""" + try: + config = Config() + database = JSONDatabase(config.database_path) + client = DiscordDataClient(config, database) + + print("Testing Discord connection...") + + # This will test the connection without starting the full bot + await client.login(config.discord_token) + user_info = client.user + + print(f"✓ Successfully connected as {user_info.name}#{user_info.discriminator}") + print(f"✓ User ID: {user_info.id}") + + await client.close() + + except Exception as e: + print(f"✗ Connection failed: {e}") + sys.exit(1) + + +def main(): + """Main CLI entry point.""" + parser = argparse.ArgumentParser(description="Discord Data Collector CLI") + subparsers = parser.add_subparsers(dest="command", help="Available commands") + + # Export command + export_parser = subparsers.add_parser("export", help="Export collected data") + export_parser.add_argument("format", choices=["csv"], help="Export format") + export_parser.add_argument("-o", "--output", help="Output file path") + + # Stats command + 
subparsers.add_parser("stats", help="Show database statistics") + + # Search command + search_parser = subparsers.add_parser("search", help="Search for users") + search_parser.add_argument("query", help="Search query (username or user ID)") + + # Backup command + subparsers.add_parser("backup", help="Create manual database backup") + + # Cleanup command + subparsers.add_parser("cleanup", help="Clean up old data and backups") + + # Test command + subparsers.add_parser("test", help="Test Discord connection") + + args = parser.parse_args() + + if not args.command: + parser.print_help() + return + + # Run the appropriate command + if args.command == "export": + asyncio.run(export_data(args.format, args.output)) + elif args.command == "stats": + asyncio.run(show_stats()) + elif args.command == "search": + asyncio.run(search_user(args.query)) + elif args.command == "backup": + asyncio.run(backup_database()) + elif args.command == "cleanup": + asyncio.run(cleanup_data()) + elif args.command == "test": + asyncio.run(test_connection()) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/config.toml b/config.toml new file mode 100644 index 0000000..e9c3209 --- /dev/null +++ b/config.toml @@ -0,0 +1,33 @@ +# Discord Data Collector Configuration + +[database] +# JSON database file path +path = "data/users.json" +# Backup interval in seconds (3600 = 1 hour) +backup_interval = 3600 + +[collection] +# What data to collect +profile_pictures = true +bio = true +status = true +server_membership = true + +[rate_limiting] +# Delay between API requests in seconds +request_delay = 1.0 +# Maximum requests per minute +max_requests_per_minute = 30 + +[monitoring] +# List of specific server IDs to monitor (leave empty to monitor all) +# Example: target_servers = [123456789, 987654321] +target_servers = [] +# Monitor all servers the account is in +monitor_all_servers = true + +[logging] +# Log level: DEBUG, INFO, WARNING, ERROR, CRITICAL +level = "INFO" +# Log 
file path +file = "logs/collector.log" \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000..d9a2a78 --- /dev/null +++ b/main.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python3 +""" +Discord User Data Collector +Main application entry point for collecting Discord user data for research purposes. +""" + +import asyncio +import logging +import sys +from pathlib import Path + +from src.client import DiscordDataClient +from src.config import Config +from src.database import JSONDatabase +from src.logger import setup_logger + + +async def main(): + """Main application entry point.""" + try: + # Setup configuration + config = Config() + + # Setup logging + logger = setup_logger(config.log_level, config.log_file) + logger.info("Starting Discord Data Collector") + + # Initialize database + database = JSONDatabase(config.database_path) + + # Initialize Discord client + client = DiscordDataClient(config, database) + + # Start the client + logger.info("Starting Discord client...") + await client.start(config.discord_token) + + except KeyboardInterrupt: + logger.info("Received keyboard interrupt, shutting down...") + except Exception as e: + logger.error(f"Fatal error: {e}", exc_info=True) + sys.exit(1) + finally: + if 'client' in locals(): + await client.close() + logger.info("Application shutdown complete") + + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..dc2afab --- /dev/null +++ b/readme.md @@ -0,0 +1,208 @@ +# ⚠️ Warning! ai slop. + + +# Discord Data Collector + +A Python application for collecting Discord user data for research purposes, specifically designed to study information propagation patterns in Discord communities. + +## Important Disclaimers + +- **Terms of Service**: This application uses self-botting, which violates Discord's Terms of Service and may result in account suspension. 
+- **Educational Use Only**: This tool is intended solely for educational and research purposes. +- **Privacy Considerations**: Always respect user privacy and obtain proper consent when collecting data. +- **Legal Compliance**: Ensure compliance with applicable data protection laws (GDPR, CCPA, etc.). + +## Features + +- **User Data Collection**: Automatically collects usernames, profile pictures, bios, status, and server memberships +- **Message Monitoring**: Processes messages from monitored servers to identify active users +- **Rate Limiting**: Built-in rate limiting to avoid hitting Discord API limits +- **Flexible Configuration**: Easy configuration via TOML and environment files +- **Data Export**: Export collected data to CSV format +- **Database Management**: JSON-based storage with automatic backups +- **CLI Tools**: Command-line interface for data management and analysis + +## Installation + +1. **Clone the repository**: + ```bash + git clone + cd discord-data-collector + ``` + +2. **Install dependencies**: + ```bash + pip install -r requirements.txt + ``` + +3. **Create configuration files**: + ```bash + cp .env.example .env + # Edit .env with your Discord token + ``` + +4. 
**Configure settings**: + - Edit `config.toml` to adjust collection settings + - Add your Discord user token to `.env` + +## Configuration + +### Environment Variables (.env) + +```env +# Your Discord user token (REQUIRED) +DISCORD_TOKEN=your_discord_user_token_here +``` + +### Configuration File (config.toml) + +```toml +[database] +path = "data/users.json" +backup_interval = 3600 + +[collection] +profile_pictures = true +bio = true +status = true +server_membership = true + +[rate_limiting] +request_delay = 1.0 +max_requests_per_minute = 30 + +[monitoring] +target_servers = [] # Empty = monitor all servers +monitor_all_servers = true + +[logging] +level = "INFO" +file = "logs/collector.log" +``` + +## Usage + +### Running the Collector + +```bash +# Start the data collector +python main.py +``` + +### CLI Commands + +```bash +# Show database statistics +python cli.py stats + +# Search for users +python cli.py search "username" + +# Export data to CSV +python cli.py export csv -o exported_data.csv + +# Test Discord connection +python cli.py test + +# Create manual backup +python cli.py backup + +# Clean up old backups +python cli.py cleanup +``` + +## Project Structure + +``` +discord-data-collector/ +├── main.py # Main application entry point +├── cli.py # Command-line interface +├── config.toml # Configuration file +├── .env # Environment variables +├── requirements.txt # Python dependencies +├── src/ +│ ├── __init__.py +│ ├── client.py # Discord client implementation +│ ├── config.py # Configuration management +│ ├── database.py # JSON database manager +│ ├── rate_limiter.py # Rate limiting utilities +│ └── logger.py # Logging setup +├── data/ +│ ├── users.json # User database +│ └── backups/ # Database backups +└── logs/ + └── collector.log # Application logs +``` + +## Data Structure + +Each user entry contains: + +```json +{ + "user_id": 123456789, + "username": "example_user", + "discriminator": "1234", + "display_name": "Example User", + "avatar_url": 
"https://cdn.discordapp.com/avatars/...", + "banner_url": "https://cdn.discordapp.com/banners/...", + "bio": "User's about me section", + "status": "online", + "activity": "Playing a game", + "servers": [111111111, 222222222], + "created_at": "2024-01-01T00:00:00", + "updated_at": "2024-01-01T12:00:00" +} +``` + +## Features in Detail + +### Rate Limiting +- Configurable request delays +- Per-minute request limits +- Automatic backoff on rate limit hits + +### Data Collection +- Real-time message monitoring +- Member list scanning +- Profile updates tracking +- Server membership tracking + +### Database Management +- Automatic backups +- Data deduplication +- Export capabilities +- Statistics generation + +### Logging +- Configurable log levels +- File rotation +- Separate Discord.py logging + +## Future Enhancements + +- MongoDB integration for better scalability +- Web dashboard for data visualization +- Advanced search and filtering +- Data analysis tools +- Network analysis features + +## Contributing + +1. Fork the repository +2. Create a feature branch +3. Make your changes +4. Add tests if applicable +5. Submit a pull request + +## License + +This project is for educational purposes only. Use responsibly and in compliance with applicable laws and terms of service. + +## Support + +For issues or questions, please create an issue in the repository. + +--- + +**Remember**: This tool is for educational research only. Always respect user privacy and platform terms of service. 
\ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..cf0a2b2 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,20 @@ +# Discord Data Collector Requirements + +# Discord self-bot library +discord.py-self>=2.0.0 + +# Configuration management +python-dotenv>=1.0.0 +toml>=0.10.2 + +# Database (for future MongoDB integration) +pymongo>=4.0.0 + +# Async utilities +asyncio-throttle>=1.0.0 + +# Data processing +pandas>=1.5.0 + +# Logging +colorlog>=6.0.0 \ No newline at end of file diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..6331612 --- /dev/null +++ b/src/__init__.py @@ -0,0 +1,43 @@ +""" +Discord Data Collector - A tool for collecting Discord user data for research purposes. + +This package provides functionality to collect user data from Discord servers +for academic research, particularly focused on studying information propagation +and community dynamics. + +Components: +- client: Discord client implementation +- config: Configuration management +- database: Data storage and management +- rate_limiter: API rate limiting +- logger: Logging utilities + +Usage: + from src.client import DiscordDataClient + from src.config import Config + from src.database import JSONDatabase + + config = Config() + database = JSONDatabase(config.database_path) + client = DiscordDataClient(config, database) +""" + +__version__ = "1.0.0" +__author__ = "Research Team" +__description__ = "Discord Data Collector for Research Purposes" + +# Import main classes for easier access +from .client import DiscordDataClient +from .config import Config +from .database import JSONDatabase, UserData +from .rate_limiter import RateLimiter +from .logger import setup_logger + +__all__ = [ + 'DiscordDataClient', + 'Config', + 'JSONDatabase', + 'UserData', + 'RateLimiter', + 'setup_logger' +] \ No newline at end of file diff --git a/src/client.py b/src/client.py new file mode 100644 index 0000000..24ac105 --- 
/dev/null +++ b/src/client.py @@ -0,0 +1,205 @@ +""" +JSON database manager for Discord user data storage. +""" + +import json +import asyncio +import shutil +from datetime import datetime +from pathlib import Path +from typing import Dict, List, Optional, Any +from dataclasses import dataclass, asdict +import logging + + +@dataclass +class UserData: + """Data structure for storing user information.""" + user_id: int + username: str + discriminator: str + display_name: Optional[str] = None + avatar_url: Optional[str] = None + banner_url: Optional[str] = None + bio: Optional[str] = None + status: Optional[str] = None + activity: Optional[str] = None + servers: List[int] = None + created_at: str = None + updated_at: str = None + + def __post_init__(self): + if self.servers is None: + self.servers = [] + + current_time = datetime.utcnow().isoformat() + if self.created_at is None: + self.created_at = current_time + self.updated_at = current_time + + +class JSONDatabase: + """JSON-based database for storing Discord user data.""" + + def __init__(self, database_path: str): + """Initialize the JSON database.""" + self.database_path = Path(database_path) + self.backup_path = Path("data/backups") + self.logger = logging.getLogger(__name__) + self._lock = asyncio.Lock() + self._data: Dict[str, Dict] = {} + + # Ensure database directory exists + self.database_path.parent.mkdir(parents=True, exist_ok=True) + self.backup_path.mkdir(parents=True, exist_ok=True) + + # Load existing data + self._load_data() + + def _load_data(self): + """Load data from JSON file.""" + if self.database_path.exists(): + try: + with open(self.database_path, 'r', encoding='utf-8') as f: + self._data = json.load(f) + self.logger.info(f"Loaded {len(self._data)} users from database") + except Exception as e: + self.logger.error(f"Error loading database: {e}") + self._data = {} + else: + self._data = {} + self.logger.info("Created new database") + + async def _save_data(self): + """Save data to JSON 
file.""" + async with self._lock: + try: + # Create backup before saving + if self.database_path.exists(): + backup_filename = f"users_backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" + backup_path = self.backup_path / backup_filename + shutil.copy2(self.database_path, backup_path) + + # Save data + with open(self.database_path, 'w', encoding='utf-8') as f: + json.dump(self._data, f, indent=2, ensure_ascii=False) + + self.logger.debug(f"Saved {len(self._data)} users to database") + + except Exception as e: + self.logger.error(f"Error saving database: {e}") + + async def get_user(self, user_id: int) -> Optional[UserData]: + """Get user data by ID.""" + user_key = str(user_id) + if user_key in self._data: + user_dict = self._data[user_key] + return UserData(**user_dict) + return None + + async def save_user(self, user_data: UserData): + """Save or update user data.""" + user_key = str(user_data.user_id) + + # If user exists, preserve created_at timestamp + if user_key in self._data: + user_data.created_at = self._data[user_key]['created_at'] + + # Update timestamp + user_data.updated_at = datetime.utcnow().isoformat() + + # Save to memory + self._data[user_key] = asdict(user_data) + + # Save to disk + await self._save_data() + + self.logger.debug(f"Saved user {user_data.username}#{user_data.discriminator} ({user_data.user_id})") + + async def add_server_to_user(self, user_id: int, server_id: int): + """Add a server to user's server list.""" + user_key = str(user_id) + if user_key in self._data: + if server_id not in self._data[user_key]['servers']: + self._data[user_key]['servers'].append(server_id) + self._data[user_key]['updated_at'] = datetime.utcnow().isoformat() + await self._save_data() + + async def get_all_users(self) -> List[UserData]: + """Get all users from the database.""" + return [UserData(**user_dict) for user_dict in self._data.values()] + + async def get_users_by_server(self, server_id: int) -> List[UserData]: + """Get all users that are 
members of a specific server.""" + users = [] + for user_dict in self._data.values(): + if server_id in user_dict.get('servers', []): + users.append(UserData(**user_dict)) + return users + + async def get_user_count(self) -> int: + """Get total number of users in database.""" + return len(self._data) + + async def get_server_count(self) -> int: + """Get total number of unique servers.""" + servers = set() + for user_dict in self._data.values(): + servers.update(user_dict.get('servers', [])) + return len(servers) + + async def cleanup_old_backups(self, max_backups: int = 10): + """Clean up old backup files, keeping only the most recent ones.""" + backup_files = sorted(self.backup_path.glob("users_backup_*.json")) + + if len(backup_files) > max_backups: + files_to_remove = backup_files[:-max_backups] + for file_path in files_to_remove: + try: + file_path.unlink() + self.logger.info(f"Removed old backup: {file_path.name}") + except Exception as e: + self.logger.error(f"Error removing backup {file_path.name}: {e}") + + async def export_to_csv(self, output_path: str): + """Export user data to CSV format.""" + import csv + + output_path = Path(output_path) + + try: + with open(output_path, 'w', newline='', encoding='utf-8') as csvfile: + fieldnames = ['user_id', 'username', 'discriminator', 'display_name', + 'avatar_url', 'bio', 'status', 'servers', 'created_at', 'updated_at'] + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + + writer.writeheader() + for user_dict in self._data.values(): + # Convert servers list to string + user_dict_copy = user_dict.copy() + user_dict_copy['servers'] = ','.join(map(str, user_dict.get('servers', []))) + writer.writerow(user_dict_copy) + + self.logger.info(f"Exported {len(self._data)} users to {output_path}") + + except Exception as e: + self.logger.error(f"Error exporting to CSV: {e}") + + async def get_statistics(self) -> Dict[str, Any]: + """Get database statistics.""" + stats = { + 'total_users': await self.get_user_count(), 
+ 'total_servers': await self.get_server_count(), + 'database_size': self.database_path.stat().st_size if self.database_path.exists() else 0 + } + + # Most active servers + server_counts = {} + for user_dict in self._data.values(): + for server_id in user_dict.get('servers', []): + server_counts[server_id] = server_counts.get(server_id, 0) + 1 + + stats['most_active_servers'] = sorted(server_counts.items(), + key=lambda x: x[1], reverse=True)[:10] + + return stats \ No newline at end of file diff --git a/src/config.py b/src/config.py new file mode 100644 index 0000000..c83b23f --- /dev/null +++ b/src/config.py @@ -0,0 +1,121 @@ +""" +Configuration management for Discord Data Collector. +""" + +import os +import toml +from pathlib import Path +from typing import List, Optional +from dotenv import load_dotenv + + +class Config: + """Configuration manager for the Discord Data Collector.""" + + def __init__(self, config_file: str = "config.toml"): + """Initialize configuration from TOML file and environment variables.""" + + # Load environment variables from .env file + load_dotenv() + + # Load TOML configuration + self.config_file = Path(config_file) + self.config_data = self._load_config() + + # Discord settings + self.discord_token = os.getenv("DISCORD_TOKEN") + if not self.discord_token: + raise ValueError("DISCORD_TOKEN environment variable is required") + + # Database settings + self.database_path = self.config_data.get("database", {}).get("path", "data/users.json") + self.backup_interval = self.config_data.get("database", {}).get("backup_interval", 3600) + + # Collection settings + collection_config = self.config_data.get("collection", {}) + self.collect_profile_pics = collection_config.get("profile_pictures", True) + self.collect_bio = collection_config.get("bio", True) + self.collect_status = collection_config.get("status", True) + self.collect_server_membership = collection_config.get("server_membership", True) + + # Rate limiting settings + rate_limit_config 
= self.config_data.get("rate_limiting", {}) + self.request_delay = rate_limit_config.get("request_delay", 1.0) + self.max_requests_per_minute = rate_limit_config.get("max_requests_per_minute", 30) + + # Monitoring settings + monitoring_config = self.config_data.get("monitoring", {}) + self.target_servers = monitoring_config.get("target_servers", []) + self.monitor_all_servers = monitoring_config.get("monitor_all_servers", True) + + # Logging settings + logging_config = self.config_data.get("logging", {}) + self.log_level = logging_config.get("level", "INFO") + self.log_file = logging_config.get("file", "logs/collector.log") + + # Ensure directories exist + self._ensure_directories() + + def _load_config(self) -> dict: + """Load configuration from TOML file.""" + if not self.config_file.exists(): + self._create_default_config() + + try: + with open(self.config_file, 'r') as f: + return toml.load(f) + except Exception as e: + print(f"Error loading config file: {e}") + return {} + + def _create_default_config(self): + """Create a default configuration file.""" + default_config = { + "database": { + "path": "data/users.json", + "backup_interval": 3600 + }, + "collection": { + "profile_pictures": True, + "bio": True, + "status": True, + "server_membership": True + }, + "rate_limiting": { + "request_delay": 1.0, + "max_requests_per_minute": 30 + }, + "monitoring": { + "target_servers": [], + "monitor_all_servers": True + }, + "logging": { + "level": "INFO", + "file": "logs/collector.log" + } + } + + # Create directory if it doesn't exist + self.config_file.parent.mkdir(parents=True, exist_ok=True) + + with open(self.config_file, 'w') as f: + toml.dump(default_config, f) + + print(f"Created default configuration file: {self.config_file}") + + def _ensure_directories(self): + """Ensure required directories exist.""" + directories = [ + Path(self.database_path).parent, + Path(self.log_file).parent, + Path("data/backups") + ] + + for directory in directories: + 
directory.mkdir(parents=True, exist_ok=True) + + def get_target_servers(self) -> List[int]: + """Get list of target server IDs.""" + if self.monitor_all_servers: + return [] + return [int(server_id) for server_id in self.target_servers] \ No newline at end of file diff --git a/src/database.py b/src/database.py new file mode 100644 index 0000000..24ac105 --- /dev/null +++ b/src/database.py @@ -0,0 +1,205 @@ +""" +JSON database manager for Discord user data storage. +""" + +import json +import asyncio +import shutil +from datetime import datetime +from pathlib import Path +from typing import Dict, List, Optional, Any +from dataclasses import dataclass, asdict +import logging + + +@dataclass +class UserData: + """Data structure for storing user information.""" + user_id: int + username: str + discriminator: str + display_name: Optional[str] = None + avatar_url: Optional[str] = None + banner_url: Optional[str] = None + bio: Optional[str] = None + status: Optional[str] = None + activity: Optional[str] = None + servers: List[int] = None + created_at: str = None + updated_at: str = None + + def __post_init__(self): + if self.servers is None: + self.servers = [] + + current_time = datetime.utcnow().isoformat() + if self.created_at is None: + self.created_at = current_time + self.updated_at = current_time + + +class JSONDatabase: + """JSON-based database for storing Discord user data.""" + + def __init__(self, database_path: str): + """Initialize the JSON database.""" + self.database_path = Path(database_path) + self.backup_path = Path("data/backups") + self.logger = logging.getLogger(__name__) + self._lock = asyncio.Lock() + self._data: Dict[str, Dict] = {} + + # Ensure database directory exists + self.database_path.parent.mkdir(parents=True, exist_ok=True) + self.backup_path.mkdir(parents=True, exist_ok=True) + + # Load existing data + self._load_data() + + def _load_data(self): + """Load data from JSON file.""" + if self.database_path.exists(): + try: + with 
open(self.database_path, 'r', encoding='utf-8') as f: + self._data = json.load(f) + self.logger.info(f"Loaded {len(self._data)} users from database") + except Exception as e: + self.logger.error(f"Error loading database: {e}") + self._data = {} + else: + self._data = {} + self.logger.info("Created new database") + + async def _save_data(self): + """Save data to JSON file.""" + async with self._lock: + try: + # Create backup before saving + if self.database_path.exists(): + backup_filename = f"users_backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" + backup_path = self.backup_path / backup_filename + shutil.copy2(self.database_path, backup_path) + + # Save data + with open(self.database_path, 'w', encoding='utf-8') as f: + json.dump(self._data, f, indent=2, ensure_ascii=False) + + self.logger.debug(f"Saved {len(self._data)} users to database") + + except Exception as e: + self.logger.error(f"Error saving database: {e}") + + async def get_user(self, user_id: int) -> Optional[UserData]: + """Get user data by ID.""" + user_key = str(user_id) + if user_key in self._data: + user_dict = self._data[user_key] + return UserData(**user_dict) + return None + + async def save_user(self, user_data: UserData): + """Save or update user data.""" + user_key = str(user_data.user_id) + + # If user exists, preserve created_at timestamp + if user_key in self._data: + user_data.created_at = self._data[user_key]['created_at'] + + # Update timestamp + user_data.updated_at = datetime.utcnow().isoformat() + + # Save to memory + self._data[user_key] = asdict(user_data) + + # Save to disk + await self._save_data() + + self.logger.debug(f"Saved user {user_data.username}#{user_data.discriminator} ({user_data.user_id})") + + async def add_server_to_user(self, user_id: int, server_id: int): + """Add a server to user's server list.""" + user_key = str(user_id) + if user_key in self._data: + if server_id not in self._data[user_key]['servers']: + 
async def get_all_users(self) -> List[UserData]:
    """Return every stored user record rehydrated as a UserData instance."""
    return [UserData(**record) for record in self._data.values()]


async def get_users_by_server(self, server_id: int) -> List[UserData]:
    """Return all users whose stored 'servers' list contains *server_id*."""
    return [
        UserData(**record)
        for record in self._data.values()
        if server_id in record.get('servers', [])
    ]


async def get_user_count(self) -> int:
    """Return the total number of users in the database."""
    return len(self._data)


async def get_server_count(self) -> int:
    """Return the number of distinct server ids referenced by any user."""
    servers = set()
    for record in self._data.values():
        servers.update(record.get('servers', []))
    return len(servers)


async def cleanup_old_backups(self, max_backups: int = 10):
    """Delete the oldest backup files, keeping at most *max_backups*.

    Backup filenames embed a %Y%m%d_%H%M%S timestamp, so a lexical sort
    orders them chronologically; the slice drops everything but the
    most recent *max_backups* entries. Failures to delete are logged
    and do not abort the sweep.
    """
    backup_files = sorted(self.backup_path.glob("users_backup_*.json"))

    if len(backup_files) > max_backups:
        for file_path in backup_files[:-max_backups]:
            try:
                file_path.unlink()
                self.logger.info(f"Removed old backup: {file_path.name}")
            except Exception as e:
                self.logger.error(f"Error removing backup {file_path.name}: {e}")


async def export_to_csv(self, output_path: str):
    """Export all user records to *output_path* in CSV format.

    Fix: DictWriter now uses extrasaction='ignore'. Previously, any
    stored record carrying a key outside the declared fieldnames made
    writerow() raise ValueError, which the broad except converted into
    a logged error and a silently truncated export. Unknown fields are
    now skipped instead; missing fields are written as empty strings
    (DictWriter's default restval).
    """
    import csv

    output_path = Path(output_path)

    fieldnames = ['user_id', 'username', 'discriminator', 'display_name',
                  'avatar_url', 'bio', 'status', 'servers', 'created_at', 'updated_at']

    try:
        with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
                                    extrasaction='ignore')
            writer.writeheader()
            for record in self._data.values():
                row = record.copy()
                # Flatten the server-id list into a comma-separated string
                # so it fits a single CSV cell.
                row['servers'] = ','.join(map(str, record.get('servers', [])))
                writer.writerow(row)

        self.logger.info(f"Exported {len(self._data)} users to {output_path}")

    except Exception as e:
        # Best-effort export: log and return rather than propagate.
        self.logger.error(f"Error exporting to CSV: {e}")


async def get_statistics(self) -> Dict[str, Any]:
    """Return summary statistics: totals, on-disk size, busiest servers."""
    stats = {
        'total_users': await self.get_user_count(),
        'total_servers': await self.get_server_count(),
        # stat() only when the file exists; 0 otherwise.
        'database_size': self.database_path.stat().st_size if self.database_path.exists() else 0
    }

    # Count memberships per server across every stored user.
    server_counts = {}
    for record in self._data.values():
        for server_id in record.get('servers', []):
            server_counts[server_id] = server_counts.get(server_id, 0) + 1

    # Ten most-populated servers as (server_id, member_count) pairs,
    # largest first.
    stats['most_active_servers'] = sorted(server_counts.items(),
                                          key=lambda kv: kv[1], reverse=True)[:10]

    return stats
class RateLimiter:
    """Client-side throttle for Discord API requests.

    Enforces two independent constraints: a rolling 60-second cap on the
    number of requests, and a minimum spacing between consecutive requests.
    """

    def __init__(self, requests_per_minute: int = 30, delay_between_requests: float = 1.0):
        """Initialize the limiter.

        Args:
            requests_per_minute: Maximum requests allowed in any 60-second window.
            delay_between_requests: Minimum gap between consecutive requests, seconds.
        """
        self.requests_per_minute = requests_per_minute
        self.delay_between_requests = delay_between_requests
        # Timestamps of recent requests; stale entries are pruned lazily.
        self.request_times = deque()
        # 0 means "no request yet", so the first call is never delayed.
        self.last_request_time = 0
        self.logger = logging.getLogger(__name__)

    def _drop_stale(self, now: float) -> None:
        """Discard timestamps that have aged out of the 60-second window."""
        window = self.request_times
        while window and now - window[0] > 60:
            window.popleft()

    async def wait(self):
        """Sleep as needed so the next request respects both limits."""
        now = time.time()
        self._drop_stale(now)

        # Window full: sleep until the oldest entry leaves the window.
        if len(self.request_times) >= self.requests_per_minute:
            pause = 60 - (now - self.request_times[0])
            if pause > 0:
                self.logger.debug(f"Rate limit reached, waiting {pause:.2f} seconds")
                await asyncio.sleep(pause)
                now = time.time()

        # Enforce the minimum gap since the previous request.
        elapsed = now - self.last_request_time
        if elapsed < self.delay_between_requests:
            await asyncio.sleep(self.delay_between_requests - elapsed)
            now = time.time()

        # Account for this request.
        self.request_times.append(now)
        self.last_request_time = now

    def get_stats(self) -> dict:
        """Return a snapshot of the limiter's current state."""
        now = time.time()
        self._drop_stale(now)

        return {
            'requests_last_minute': len(self.request_times),
            'requests_per_minute_limit': self.requests_per_minute,
            'delay_between_requests': self.delay_between_requests,
            'time_since_last_request': now - self.last_request_time,
        }