Initial commit v2

Xargana 2025-07-13 21:04:53 +03:00
parent 5b961b3853
commit 8edda894db
12 changed files with 1211 additions and 0 deletions

13
.env.example Normal file

@@ -0,0 +1,13 @@
# Discord Data Collector Environment Variables
# Copy this file to .env and fill in your values
# Discord user token (REQUIRED)
# WARNING: This should be your user token, not a bot token
# Keep this secret and never share it publicly
DISCORD_TOKEN=your_discord_user_token_here
# Optional: Database connection string for future MongoDB integration
# MONGODB_URI=mongodb://localhost:27017/discord_research
# Optional: Additional API keys for extended functionality
# BACKUP_WEBHOOK_URL=https://discord.com/api/webhooks/your_webhook_url

182
cli.py Normal file

@@ -0,0 +1,182 @@
#!/usr/bin/env python3
"""
Command-line interface for Discord Data Collector.
"""
import argparse
import asyncio
import sys
from pathlib import Path
from typing import Optional

# Ensure the project root is on sys.path so the src package resolves
sys.path.append(str(Path(__file__).parent))
from src.config import Config
from src.database import JSONDatabase
from src.client import DiscordDataClient
async def export_data(format_type: str, output_path: Optional[str] = None):
"""Export collected data."""
config = Config()
database = JSONDatabase(config.database_path)
if output_path is None:
from datetime import datetime
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_path = f"data/export_{timestamp}.{format_type}"
if format_type == "csv":
await database.export_to_csv(output_path)
print(f"Data exported to {output_path}")
else:
print(f"Unsupported format: {format_type}")
async def show_stats():
"""Show database statistics."""
config = Config()
database = JSONDatabase(config.database_path)
stats = await database.get_statistics()
print("\n=== Database Statistics ===")
print(f"Total users: {stats['total_users']}")
print(f"Total servers: {stats['total_servers']}")
print(f"Database size: {stats['database_size']} bytes")
if stats['most_active_servers']:
print("\nMost active servers:")
for server_id, user_count in stats['most_active_servers'][:5]:
print(f" Server {server_id}: {user_count} users")
async def search_user(query: str):
"""Search for users."""
config = Config()
database = JSONDatabase(config.database_path)
all_users = await database.get_all_users()
# Search by username or user ID
results = []
for user in all_users:
if (query.lower() in user.username.lower() or
query.lower() in (user.display_name or "").lower() or
query == str(user.user_id)):
results.append(user)
if not results:
print("No users found matching the query.")
return
print(f"\n=== Found {len(results)} users ===")
for user in results[:10]: # Show first 10 results
print(f"{user.username}#{user.discriminator} (ID: {user.user_id})")
if user.display_name:
print(f" Display name: {user.display_name}")
        if user.bio:
            bio_preview = user.bio[:100] + ("..." if len(user.bio) > 100 else "")
            print(f"  Bio: {bio_preview}")
print(f" Servers: {len(user.servers)}")
print(f" Last updated: {user.updated_at}")
print()
async def backup_database():
"""Create a manual backup of the database."""
config = Config()
database = JSONDatabase(config.database_path)
from datetime import datetime
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
backup_path = f"data/backups/manual_backup_{timestamp}.json"
# Copy current database
import shutil
shutil.copy2(database.database_path, backup_path)
print(f"Database backed up to {backup_path}")
async def cleanup_data():
"""Clean up old data and backups."""
config = Config()
database = JSONDatabase(config.database_path)
await database.cleanup_old_backups(max_backups=5)
print("Cleanup completed")
async def test_connection():
"""Test Discord connection."""
try:
config = Config()
database = JSONDatabase(config.database_path)
client = DiscordDataClient(config, database)
print("Testing Discord connection...")
# This will test the connection without starting the full bot
await client.login(config.discord_token)
user_info = client.user
print(f"✓ Successfully connected as {user_info.name}#{user_info.discriminator}")
print(f"✓ User ID: {user_info.id}")
await client.close()
except Exception as e:
print(f"✗ Connection failed: {e}")
sys.exit(1)
def main():
"""Main CLI entry point."""
parser = argparse.ArgumentParser(description="Discord Data Collector CLI")
subparsers = parser.add_subparsers(dest="command", help="Available commands")
# Export command
export_parser = subparsers.add_parser("export", help="Export collected data")
export_parser.add_argument("format", choices=["csv"], help="Export format")
export_parser.add_argument("-o", "--output", help="Output file path")
# Stats command
subparsers.add_parser("stats", help="Show database statistics")
# Search command
search_parser = subparsers.add_parser("search", help="Search for users")
search_parser.add_argument("query", help="Search query (username or user ID)")
# Backup command
subparsers.add_parser("backup", help="Create manual database backup")
# Cleanup command
subparsers.add_parser("cleanup", help="Clean up old data and backups")
# Test command
subparsers.add_parser("test", help="Test Discord connection")
args = parser.parse_args()
if not args.command:
parser.print_help()
return
# Run the appropriate command
if args.command == "export":
asyncio.run(export_data(args.format, args.output))
elif args.command == "stats":
asyncio.run(show_stats())
elif args.command == "search":
asyncio.run(search_user(args.query))
elif args.command == "backup":
asyncio.run(backup_database())
elif args.command == "cleanup":
asyncio.run(cleanup_data())
elif args.command == "test":
asyncio.run(test_connection())
if __name__ == "__main__":
main()

33
config.toml Normal file

@@ -0,0 +1,33 @@
# Discord Data Collector Configuration
[database]
# JSON database file path
path = "data/users.json"
# Backup interval in seconds (3600 = 1 hour)
backup_interval = 3600
[collection]
# What data to collect
profile_pictures = true
bio = true
status = true
server_membership = true
[rate_limiting]
# Delay between API requests in seconds
request_delay = 1.0
# Maximum requests per minute
max_requests_per_minute = 30
[monitoring]
# List of specific server IDs to monitor (leave empty to monitor all)
# Example: target_servers = [123456789, 987654321]
target_servers = []
# Monitor all servers the account is in
monitor_all_servers = true
[logging]
# Log level: DEBUG, INFO, WARNING, ERROR, CRITICAL
level = "INFO"
# Log file path
file = "logs/collector.log"

50
main.py Normal file

@@ -0,0 +1,50 @@
#!/usr/bin/env python3
"""
Discord User Data Collector
Main application entry point for collecting Discord user data for research purposes.
"""
import asyncio
import logging
import sys
from pathlib import Path
from src.client import DiscordDataClient
from src.config import Config
from src.database import JSONDatabase
from src.logger import setup_logger
async def main():
    """Main application entry point."""
    client = None
    logger = logging.getLogger(__name__)
    try:
        # Setup configuration
        config = Config()
        # Setup logging
        logger = setup_logger(config.log_level, config.log_file)
        logger.info("Starting Discord Data Collector")
        # Initialize database
        database = JSONDatabase(config.database_path)
        # Initialize Discord client
        client = DiscordDataClient(config, database)
        # Start the client
        logger.info("Starting Discord client...")
        await client.start(config.discord_token)
    except KeyboardInterrupt:
        logger.info("Received keyboard interrupt, shutting down...")
    except Exception as e:
        logger.error(f"Fatal error: {e}", exc_info=True)
        sys.exit(1)
    finally:
        # Guard against failures that happen before the client is created
        if client is not None:
            await client.close()
        logger.info("Application shutdown complete")
if __name__ == "__main__":
asyncio.run(main())

208
readme.md Normal file

@@ -0,0 +1,208 @@
# ⚠️ Warning! AI slop.
# Discord Data Collector
A Python application for collecting Discord user data for research purposes, specifically designed to study information propagation patterns in Discord communities.
## Important Disclaimers
- **Terms of Service**: This application uses self-botting, which violates Discord's Terms of Service and may result in account suspension.
- **Educational Use Only**: This tool is intended solely for educational and research purposes.
- **Privacy Considerations**: Always respect user privacy and obtain proper consent when collecting data.
- **Legal Compliance**: Ensure compliance with applicable data protection laws (GDPR, CCPA, etc.).
## Features
- **User Data Collection**: Automatically collects usernames, profile pictures, bios, status, and server memberships
- **Message Monitoring**: Processes messages from monitored servers to identify active users
- **Rate Limiting**: Built-in rate limiting to avoid hitting Discord API limits
- **Flexible Configuration**: Easy configuration via TOML and environment files
- **Data Export**: Export collected data to CSV format
- **Database Management**: JSON-based storage with automatic backups
- **CLI Tools**: Command-line interface for data management and analysis
## Installation
1. **Clone the repository**:
```bash
git clone <repository-url>
cd discord-data-collector
```
2. **Install dependencies**:
```bash
pip install -r requirements.txt
```
3. **Create configuration files**:
```bash
cp .env.example .env
# Edit .env with your Discord token
```
4. **Configure settings**:
- Edit `config.toml` to adjust collection settings
- Add your Discord user token to `.env`
## Configuration
### Environment Variables (.env)
```env
# Your Discord user token (REQUIRED)
DISCORD_TOKEN=your_discord_user_token_here
```
### Configuration File (config.toml)
```toml
[database]
path = "data/users.json"
backup_interval = 3600
[collection]
profile_pictures = true
bio = true
status = true
server_membership = true
[rate_limiting]
request_delay = 1.0
max_requests_per_minute = 30
[monitoring]
target_servers = [] # Empty = monitor all servers
monitor_all_servers = true
[logging]
level = "INFO"
file = "logs/collector.log"
```
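These settings surface as attributes on the `Config` class in `src/config.py`. A minimal sketch of reading them (it assumes `DISCORD_TOKEN` is set in `.env`, since `Config` raises without it):

```python
from src.config import Config

config = Config()                    # loads .env, then config.toml
print(config.database_path)          # "data/users.json"
print(config.request_delay)          # 1.0
print(config.get_target_servers())   # [] while monitor_all_servers = true
```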
## Usage
### Running the Collector
```bash
# Start the data collector
python main.py
```
### CLI Commands
```bash
# Show database statistics
python cli.py stats
# Search for users
python cli.py search "username"
# Export data to CSV
python cli.py export csv -o exported_data.csv
# Test Discord connection
python cli.py test
# Create manual backup
python cli.py backup
# Clean up old backups
python cli.py cleanup
```
## Project Structure
```
discord-data-collector/
├── main.py # Main application entry point
├── cli.py # Command-line interface
├── config.toml # Configuration file
├── .env # Environment variables
├── requirements.txt # Python dependencies
├── src/
│ ├── __init__.py
│ ├── client.py # Discord client implementation
│ ├── config.py # Configuration management
│ ├── database.py # JSON database manager
│ ├── rate_limiter.py # Rate limiting utilities
│ └── logger.py # Logging setup
├── data/
│ ├── users.json # User database
│ └── backups/ # Database backups
└── logs/
└── collector.log # Application logs
```
## Data Structure
Each user entry contains:
```json
{
"user_id": 123456789,
"username": "example_user",
"discriminator": "1234",
"display_name": "Example User",
"avatar_url": "https://cdn.discordapp.com/avatars/...",
"banner_url": "https://cdn.discordapp.com/banners/...",
"bio": "User's about me section",
"status": "online",
"activity": "Playing a game",
"servers": [111111111, 222222222],
"created_at": "2024-01-01T00:00:00",
"updated_at": "2024-01-01T12:00:00"
}
```
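Each entry is a serialized `UserData` dataclass from `src/database.py`; the timestamps and `servers` list are filled in automatically. A minimal round-trip sketch:

```python
from dataclasses import asdict

from src.database import UserData

user = UserData(user_id=123456789, username="example_user", discriminator="1234")
user.servers.append(111111111)
print(asdict(user))  # dict matching the JSON structure shown above
```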
## Features in Detail
### Rate Limiting
- Configurable request delays
- Per-minute request limits
- Automatic waiting once the per-minute budget is exhausted, so Discord's limits are never hit (see the sketch below)
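
A minimal sketch of how `RateLimiter` from `src/rate_limiter.py` gates a batch of requests; the loop body is a stand-in for a real API call:

```python
import asyncio

from src.rate_limiter import RateLimiter

async def fetch_profiles(user_ids):
    limiter = RateLimiter(requests_per_minute=30, delay_between_requests=1.0)
    for user_id in user_ids:
        await limiter.wait()          # sleeps if the window or minimum delay requires it
        print(f"fetching {user_id}")  # placeholder for the actual request
    print(limiter.get_stats())

asyncio.run(fetch_profiles(range(5)))
```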
### Data Collection
- Real-time message monitoring
- Member list scanning
- Profile updates tracking
- Server membership tracking
### Database Management
- Automatic backups
- Data deduplication
- Export capabilities
- Statistics generation (see the example below)
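
The CLI wraps the `JSONDatabase` API from `src/database.py`, which can also be used directly; a short example:

```python
import asyncio

from src.database import JSONDatabase

async def report():
    db = JSONDatabase("data/users.json")
    stats = await db.get_statistics()
    print(f"{stats['total_users']} users across {stats['total_servers']} servers")
    await db.export_to_csv("data/export.csv")

asyncio.run(report())
```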
### Logging
- Configurable log levels
- File rotation
- Discord.py's own loggers capped at WARNING to cut noise (setup shown below)
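
`setup_logger` from `src/logger.py` configures the console and rotating-file handlers in one call:

```python
from src.logger import setup_logger

logger = setup_logger(log_level="DEBUG", log_file="logs/collector.log")
logger.debug("goes to stdout and to a 10 MB rotating log file")
```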
## Future Enhancements
- MongoDB integration for better scalability
- Web dashboard for data visualization
- Advanced search and filtering
- Data analysis tools
- Network analysis features
## Contributing
1. Fork the repository
2. Create a feature branch
3. Make your changes
4. Add tests if applicable
5. Submit a pull request
## License
This project is for educational purposes only. Use responsibly and in compliance with applicable laws and terms of service.
## Support
For issues or questions, please create an issue in the repository.
---
**Remember**: This tool is for educational research only. Always respect user privacy and platform terms of service.

20
requirements.txt Normal file

@@ -0,0 +1,20 @@
# Discord Data Collector Requirements
# Discord self-bot library
discord.py-self>=2.0.0
# Configuration management
python-dotenv>=1.0.0
toml>=0.10.2
# Database (for future MongoDB integration)
pymongo>=4.0.0
# Async utilities
asyncio-throttle>=1.0.0
# Data processing
pandas>=1.5.0
# Logging
colorlog>=6.0.0

43
src/__init__.py Normal file

@@ -0,0 +1,43 @@
"""
Discord Data Collector - A tool for collecting Discord user data for research purposes.
This package provides functionality to collect user data from Discord servers
for academic research, particularly focused on studying information propagation
and community dynamics.
Components:
- client: Discord client implementation
- config: Configuration management
- database: Data storage and management
- rate_limiter: API rate limiting
- logger: Logging utilities
Usage:
from src.client import DiscordDataClient
from src.config import Config
from src.database import JSONDatabase
config = Config()
database = JSONDatabase(config.database_path)
client = DiscordDataClient(config, database)
"""
__version__ = "1.0.0"
__author__ = "Research Team"
__description__ = "Discord Data Collector for Research Purposes"
# Import main classes for easier access
from .client import DiscordDataClient
from .config import Config
from .database import JSONDatabase, UserData
from .rate_limiter import RateLimiter
from .logger import setup_logger
__all__ = [
'DiscordDataClient',
'Config',
'JSONDatabase',
'UserData',
'RateLimiter',
'setup_logger'
]

205
src/client.py Normal file

@@ -0,0 +1,205 @@
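"""
Discord client implementation for the Data Collector.

NOTE: this is a minimal sketch inferred from how DiscordDataClient is used in
main.py and cli.py; it assumes the discord.py-self Client API and omits the
member-scanning and profile-update handlers described in the readme. It is
not the original implementation.
"""
import logging

import discord  # provided by discord.py-self

from .config import Config
from .database import JSONDatabase, UserData
from .rate_limiter import RateLimiter


class DiscordDataClient(discord.Client):
    """Self-bot client that records user profiles from monitored servers."""

    def __init__(self, config: Config, database: JSONDatabase, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.database = database
        self.rate_limiter = RateLimiter(
            requests_per_minute=config.max_requests_per_minute,
            delay_between_requests=config.request_delay,
        )
        self.logger = logging.getLogger(__name__)

    def _is_monitored(self, guild_id: int) -> bool:
        """An empty target list means every server is monitored."""
        targets = self.config.get_target_servers()
        return not targets or guild_id in targets

    async def on_ready(self):
        self.logger.info(f"Logged in as {self.user} ({self.user.id})")

    async def on_message(self, message: discord.Message):
        """Record the author of any message seen in a monitored server."""
        if message.guild is None or not self._is_monitored(message.guild.id):
            return
        await self.rate_limiter.wait()
        author = message.author
        user_data = await self.database.get_user(author.id) or UserData(
            user_id=author.id,
            username=author.name,
            discriminator=author.discriminator,
        )
        user_data.display_name = getattr(author, "display_name", None)
        if self.config.collect_profile_pics and author.avatar:
            user_data.avatar_url = str(author.avatar.url)
        if self.config.collect_server_membership and message.guild.id not in user_data.servers:
            user_data.servers.append(message.guild.id)
        await self.database.save_user(user_data)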
"""
JSON database manager for Discord user data storage.
"""
import json
import asyncio
import shutil
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, asdict
import logging
@dataclass
class UserData:
"""Data structure for storing user information."""
user_id: int
username: str
discriminator: str
display_name: Optional[str] = None
avatar_url: Optional[str] = None
banner_url: Optional[str] = None
bio: Optional[str] = None
status: Optional[str] = None
activity: Optional[str] = None
servers: List[int] = None
created_at: str = None
updated_at: str = None
def __post_init__(self):
if self.servers is None:
self.servers = []
current_time = datetime.utcnow().isoformat()
if self.created_at is None:
self.created_at = current_time
self.updated_at = current_time
class JSONDatabase:
"""JSON-based database for storing Discord user data."""
def __init__(self, database_path: str):
"""Initialize the JSON database."""
self.database_path = Path(database_path)
self.backup_path = Path("data/backups")
self.logger = logging.getLogger(__name__)
self._lock = asyncio.Lock()
self._data: Dict[str, Dict] = {}
# Ensure database directory exists
self.database_path.parent.mkdir(parents=True, exist_ok=True)
self.backup_path.mkdir(parents=True, exist_ok=True)
# Load existing data
self._load_data()
def _load_data(self):
"""Load data from JSON file."""
if self.database_path.exists():
try:
with open(self.database_path, 'r', encoding='utf-8') as f:
self._data = json.load(f)
self.logger.info(f"Loaded {len(self._data)} users from database")
except Exception as e:
self.logger.error(f"Error loading database: {e}")
self._data = {}
else:
self._data = {}
self.logger.info("Created new database")
async def _save_data(self):
"""Save data to JSON file."""
async with self._lock:
try:
# Create backup before saving
if self.database_path.exists():
backup_filename = f"users_backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
backup_path = self.backup_path / backup_filename
shutil.copy2(self.database_path, backup_path)
# Save data
with open(self.database_path, 'w', encoding='utf-8') as f:
json.dump(self._data, f, indent=2, ensure_ascii=False)
self.logger.debug(f"Saved {len(self._data)} users to database")
except Exception as e:
self.logger.error(f"Error saving database: {e}")
async def get_user(self, user_id: int) -> Optional[UserData]:
"""Get user data by ID."""
user_key = str(user_id)
if user_key in self._data:
user_dict = self._data[user_key]
return UserData(**user_dict)
return None
async def save_user(self, user_data: UserData):
"""Save or update user data."""
user_key = str(user_data.user_id)
# If user exists, preserve created_at timestamp
if user_key in self._data:
user_data.created_at = self._data[user_key]['created_at']
# Update timestamp
user_data.updated_at = datetime.utcnow().isoformat()
# Save to memory
self._data[user_key] = asdict(user_data)
# Save to disk
await self._save_data()
self.logger.debug(f"Saved user {user_data.username}#{user_data.discriminator} ({user_data.user_id})")
async def add_server_to_user(self, user_id: int, server_id: int):
"""Add a server to user's server list."""
user_key = str(user_id)
if user_key in self._data:
if server_id not in self._data[user_key]['servers']:
self._data[user_key]['servers'].append(server_id)
self._data[user_key]['updated_at'] = datetime.utcnow().isoformat()
await self._save_data()
async def get_all_users(self) -> List[UserData]:
"""Get all users from the database."""
return [UserData(**user_dict) for user_dict in self._data.values()]
async def get_users_by_server(self, server_id: int) -> List[UserData]:
"""Get all users that are members of a specific server."""
users = []
for user_dict in self._data.values():
if server_id in user_dict.get('servers', []):
users.append(UserData(**user_dict))
return users
async def get_user_count(self) -> int:
"""Get total number of users in database."""
return len(self._data)
async def get_server_count(self) -> int:
"""Get total number of unique servers."""
servers = set()
for user_dict in self._data.values():
servers.update(user_dict.get('servers', []))
return len(servers)
async def cleanup_old_backups(self, max_backups: int = 10):
"""Clean up old backup files, keeping only the most recent ones."""
backup_files = sorted(self.backup_path.glob("users_backup_*.json"))
if len(backup_files) > max_backups:
files_to_remove = backup_files[:-max_backups]
for file_path in files_to_remove:
try:
file_path.unlink()
self.logger.info(f"Removed old backup: {file_path.name}")
except Exception as e:
self.logger.error(f"Error removing backup {file_path.name}: {e}")
async def export_to_csv(self, output_path: str):
"""Export user data to CSV format."""
import csv
output_path = Path(output_path)
try:
with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
fieldnames = ['user_id', 'username', 'discriminator', 'display_name',
'avatar_url', 'bio', 'status', 'servers', 'created_at', 'updated_at']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for user_dict in self._data.values():
# Convert servers list to string
user_dict_copy = user_dict.copy()
user_dict_copy['servers'] = ','.join(map(str, user_dict.get('servers', [])))
writer.writerow(user_dict_copy)
self.logger.info(f"Exported {len(self._data)} users to {output_path}")
except Exception as e:
self.logger.error(f"Error exporting to CSV: {e}")
async def get_statistics(self) -> Dict[str, Any]:
"""Get database statistics."""
stats = {
'total_users': await self.get_user_count(),
'total_servers': await self.get_server_count(),
'database_size': self.database_path.stat().st_size if self.database_path.exists() else 0
}
# Most active servers
server_counts = {}
for user_dict in self._data.values():
for server_id in user_dict.get('servers', []):
server_counts[server_id] = server_counts.get(server_id, 0) + 1
stats['most_active_servers'] = sorted(server_counts.items(),
key=lambda x: x[1], reverse=True)[:10]
return stats

121
src/config.py Normal file

@@ -0,0 +1,121 @@
"""
Configuration management for Discord Data Collector.
"""
import os
import toml
from pathlib import Path
from typing import List, Optional
from dotenv import load_dotenv
class Config:
"""Configuration manager for the Discord Data Collector."""
def __init__(self, config_file: str = "config.toml"):
"""Initialize configuration from TOML file and environment variables."""
# Load environment variables from .env file
load_dotenv()
# Load TOML configuration
self.config_file = Path(config_file)
self.config_data = self._load_config()
# Discord settings
self.discord_token = os.getenv("DISCORD_TOKEN")
if not self.discord_token:
raise ValueError("DISCORD_TOKEN environment variable is required")
# Database settings
self.database_path = self.config_data.get("database", {}).get("path", "data/users.json")
self.backup_interval = self.config_data.get("database", {}).get("backup_interval", 3600)
# Collection settings
collection_config = self.config_data.get("collection", {})
self.collect_profile_pics = collection_config.get("profile_pictures", True)
self.collect_bio = collection_config.get("bio", True)
self.collect_status = collection_config.get("status", True)
self.collect_server_membership = collection_config.get("server_membership", True)
# Rate limiting settings
rate_limit_config = self.config_data.get("rate_limiting", {})
self.request_delay = rate_limit_config.get("request_delay", 1.0)
self.max_requests_per_minute = rate_limit_config.get("max_requests_per_minute", 30)
# Monitoring settings
monitoring_config = self.config_data.get("monitoring", {})
self.target_servers = monitoring_config.get("target_servers", [])
self.monitor_all_servers = monitoring_config.get("monitor_all_servers", True)
# Logging settings
logging_config = self.config_data.get("logging", {})
self.log_level = logging_config.get("level", "INFO")
self.log_file = logging_config.get("file", "logs/collector.log")
# Ensure directories exist
self._ensure_directories()
def _load_config(self) -> dict:
"""Load configuration from TOML file."""
if not self.config_file.exists():
self._create_default_config()
try:
with open(self.config_file, 'r') as f:
return toml.load(f)
except Exception as e:
print(f"Error loading config file: {e}")
return {}
def _create_default_config(self):
"""Create a default configuration file."""
default_config = {
"database": {
"path": "data/users.json",
"backup_interval": 3600
},
"collection": {
"profile_pictures": True,
"bio": True,
"status": True,
"server_membership": True
},
"rate_limiting": {
"request_delay": 1.0,
"max_requests_per_minute": 30
},
"monitoring": {
"target_servers": [],
"monitor_all_servers": True
},
"logging": {
"level": "INFO",
"file": "logs/collector.log"
}
}
# Create directory if it doesn't exist
self.config_file.parent.mkdir(parents=True, exist_ok=True)
with open(self.config_file, 'w') as f:
toml.dump(default_config, f)
print(f"Created default configuration file: {self.config_file}")
def _ensure_directories(self):
"""Ensure required directories exist."""
directories = [
Path(self.database_path).parent,
Path(self.log_file).parent,
Path("data/backups")
]
for directory in directories:
directory.mkdir(parents=True, exist_ok=True)
def get_target_servers(self) -> List[int]:
"""Get list of target server IDs."""
if self.monitor_all_servers:
return []
return [int(server_id) for server_id in self.target_servers]

205
src/database.py Normal file

@@ -0,0 +1,205 @@
"""
JSON database manager for Discord user data storage.
"""
import json
import asyncio
import shutil
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, asdict, field
import logging
@dataclass
class UserData:
    """Data structure for storing user information."""
    user_id: int
    username: str
    discriminator: str
    display_name: Optional[str] = None
    avatar_url: Optional[str] = None
    banner_url: Optional[str] = None
    bio: Optional[str] = None
    status: Optional[str] = None
    activity: Optional[str] = None
    servers: List[int] = field(default_factory=list)
    created_at: Optional[str] = None
    updated_at: Optional[str] = None

    def __post_init__(self):
        # Fill timestamps only when missing, so records loaded from disk keep
        # their stored values instead of being reset to "now".
        current_time = datetime.utcnow().isoformat()
        if self.created_at is None:
            self.created_at = current_time
        if self.updated_at is None:
            self.updated_at = current_time
class JSONDatabase:
"""JSON-based database for storing Discord user data."""
def __init__(self, database_path: str):
"""Initialize the JSON database."""
self.database_path = Path(database_path)
self.backup_path = Path("data/backups")
self.logger = logging.getLogger(__name__)
self._lock = asyncio.Lock()
self._data: Dict[str, Dict] = {}
# Ensure database directory exists
self.database_path.parent.mkdir(parents=True, exist_ok=True)
self.backup_path.mkdir(parents=True, exist_ok=True)
# Load existing data
self._load_data()
def _load_data(self):
"""Load data from JSON file."""
if self.database_path.exists():
try:
with open(self.database_path, 'r', encoding='utf-8') as f:
self._data = json.load(f)
self.logger.info(f"Loaded {len(self._data)} users from database")
except Exception as e:
self.logger.error(f"Error loading database: {e}")
self._data = {}
else:
self._data = {}
self.logger.info("Created new database")
async def _save_data(self):
"""Save data to JSON file."""
async with self._lock:
try:
# Create backup before saving
if self.database_path.exists():
backup_filename = f"users_backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
backup_path = self.backup_path / backup_filename
shutil.copy2(self.database_path, backup_path)
# Save data
with open(self.database_path, 'w', encoding='utf-8') as f:
json.dump(self._data, f, indent=2, ensure_ascii=False)
self.logger.debug(f"Saved {len(self._data)} users to database")
except Exception as e:
self.logger.error(f"Error saving database: {e}")
async def get_user(self, user_id: int) -> Optional[UserData]:
"""Get user data by ID."""
user_key = str(user_id)
if user_key in self._data:
user_dict = self._data[user_key]
return UserData(**user_dict)
return None
async def save_user(self, user_data: UserData):
"""Save or update user data."""
user_key = str(user_data.user_id)
# If user exists, preserve created_at timestamp
if user_key in self._data:
user_data.created_at = self._data[user_key]['created_at']
# Update timestamp
user_data.updated_at = datetime.utcnow().isoformat()
# Save to memory
self._data[user_key] = asdict(user_data)
# Save to disk
await self._save_data()
self.logger.debug(f"Saved user {user_data.username}#{user_data.discriminator} ({user_data.user_id})")
async def add_server_to_user(self, user_id: int, server_id: int):
"""Add a server to user's server list."""
user_key = str(user_id)
if user_key in self._data:
if server_id not in self._data[user_key]['servers']:
self._data[user_key]['servers'].append(server_id)
self._data[user_key]['updated_at'] = datetime.utcnow().isoformat()
await self._save_data()
async def get_all_users(self) -> List[UserData]:
"""Get all users from the database."""
return [UserData(**user_dict) for user_dict in self._data.values()]
async def get_users_by_server(self, server_id: int) -> List[UserData]:
"""Get all users that are members of a specific server."""
users = []
for user_dict in self._data.values():
if server_id in user_dict.get('servers', []):
users.append(UserData(**user_dict))
return users
async def get_user_count(self) -> int:
"""Get total number of users in database."""
return len(self._data)
async def get_server_count(self) -> int:
"""Get total number of unique servers."""
servers = set()
for user_dict in self._data.values():
servers.update(user_dict.get('servers', []))
return len(servers)
async def cleanup_old_backups(self, max_backups: int = 10):
"""Clean up old backup files, keeping only the most recent ones."""
backup_files = sorted(self.backup_path.glob("users_backup_*.json"))
if len(backup_files) > max_backups:
files_to_remove = backup_files[:-max_backups]
for file_path in files_to_remove:
try:
file_path.unlink()
self.logger.info(f"Removed old backup: {file_path.name}")
except Exception as e:
self.logger.error(f"Error removing backup {file_path.name}: {e}")
async def export_to_csv(self, output_path: str):
"""Export user data to CSV format."""
import csv
output_path = Path(output_path)
try:
with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
                fieldnames = ['user_id', 'username', 'discriminator', 'display_name',
                              'avatar_url', 'banner_url', 'bio', 'status', 'activity',
                              'servers', 'created_at', 'updated_at']
                # Every UserData field must be listed, or DictWriter raises ValueError
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for user_dict in self._data.values():
# Convert servers list to string
user_dict_copy = user_dict.copy()
user_dict_copy['servers'] = ','.join(map(str, user_dict.get('servers', [])))
writer.writerow(user_dict_copy)
self.logger.info(f"Exported {len(self._data)} users to {output_path}")
except Exception as e:
self.logger.error(f"Error exporting to CSV: {e}")
async def get_statistics(self) -> Dict[str, Any]:
"""Get database statistics."""
stats = {
'total_users': await self.get_user_count(),
'total_servers': await self.get_server_count(),
'database_size': self.database_path.stat().st_size if self.database_path.exists() else 0
}
# Most active servers
server_counts = {}
for user_dict in self._data.values():
for server_id in user_dict.get('servers', []):
server_counts[server_id] = server_counts.get(server_id, 0) + 1
stats['most_active_servers'] = sorted(server_counts.items(),
key=lambda x: x[1], reverse=True)[:10]
return stats

61
src/logger.py Normal file

@@ -0,0 +1,61 @@
"""
Logging setup for Discord Data Collector.
"""
import logging
import sys
from pathlib import Path
from logging.handlers import RotatingFileHandler
def setup_logger(log_level: str = "INFO", log_file: str = "logs/collector.log") -> logging.Logger:
"""
Setup logging configuration.
Args:
log_level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
log_file: Path to log file
Returns:
Configured logger instance
"""
# Create logs directory if it doesn't exist
log_path = Path(log_file)
log_path.parent.mkdir(parents=True, exist_ok=True)
# Create formatter
formatter = logging.Formatter(
'%(asctime)s - %(name)s - %(levelname)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
# Setup root logger
root_logger = logging.getLogger()
root_logger.setLevel(getattr(logging, log_level.upper()))
# Clear existing handlers
root_logger.handlers.clear()
# Console handler
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(getattr(logging, log_level.upper()))
console_handler.setFormatter(formatter)
root_logger.addHandler(console_handler)
# File handler with rotation
file_handler = RotatingFileHandler(
log_file,
maxBytes=10*1024*1024, # 10MB
backupCount=5
)
file_handler.setLevel(getattr(logging, log_level.upper()))
file_handler.setFormatter(formatter)
root_logger.addHandler(file_handler)
# Set specific logger levels for discord.py to reduce noise
logging.getLogger('discord').setLevel(logging.WARNING)
logging.getLogger('discord.http').setLevel(logging.WARNING)
logging.getLogger('discord.gateway').setLevel(logging.WARNING)
return root_logger

70
src/rate_limiter.py Normal file

@@ -0,0 +1,70 @@
"""
Rate limiter for Discord API requests.
"""
import asyncio
import time
from collections import deque
from typing import Optional
import logging
class RateLimiter:
"""Rate limiter to prevent hitting Discord API limits."""
def __init__(self, requests_per_minute: int = 30, delay_between_requests: float = 1.0):
"""
Initialize rate limiter.
Args:
requests_per_minute: Maximum requests per minute
delay_between_requests: Minimum delay between requests in seconds
"""
self.requests_per_minute = requests_per_minute
self.delay_between_requests = delay_between_requests
self.request_times = deque()
self.last_request_time = 0
self.logger = logging.getLogger(__name__)
async def wait(self):
"""Wait if necessary to respect rate limits."""
current_time = time.time()
# Remove old requests (older than 1 minute)
while self.request_times and current_time - self.request_times[0] > 60:
self.request_times.popleft()
# Check if we're at the rate limit
if len(self.request_times) >= self.requests_per_minute:
# Wait until the oldest request is more than 1 minute old
wait_time = 60 - (current_time - self.request_times[0])
if wait_time > 0:
self.logger.debug(f"Rate limit reached, waiting {wait_time:.2f} seconds")
await asyncio.sleep(wait_time)
current_time = time.time()
# Check minimum delay between requests
time_since_last = current_time - self.last_request_time
if time_since_last < self.delay_between_requests:
wait_time = self.delay_between_requests - time_since_last
await asyncio.sleep(wait_time)
current_time = time.time()
# Record this request
self.request_times.append(current_time)
self.last_request_time = current_time
def get_stats(self) -> dict:
"""Get rate limiter statistics."""
current_time = time.time()
# Clean old requests
while self.request_times and current_time - self.request_times[0] > 60:
self.request_times.popleft()
return {
'requests_last_minute': len(self.request_times),
'requests_per_minute_limit': self.requests_per_minute,
'delay_between_requests': self.delay_between_requests,
'time_since_last_request': current_time - self.last_request_time
}