Initial commit v2
This commit is contained in:
parent 5b961b3853
commit 8edda894db
13 .env.example Normal file
@@ -0,0 +1,13 @@
# Discord Data Collector Environment Variables
# Copy this file to .env and fill in your values

# Discord user token (REQUIRED)
# WARNING: This should be your user token, not a bot token
# Keep this secret and never share it publicly
DISCORD_TOKEN=your_discord_user_token_here

# Optional: Database connection string for future MongoDB integration
# MONGODB_URI=mongodb://localhost:27017/discord_research

# Optional: Additional API keys for extended functionality
# BACKUP_WEBHOOK_URL=https://discord.com/api/webhooks/your_webhook_url
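
For reference, a minimal sketch of how these variables are consumed at startup; it mirrors the `load_dotenv()` / `os.getenv()` pattern in `src/config.py` below, including the fail-fast token check:

```python
import os

from dotenv import load_dotenv  # python-dotenv, pinned in requirements.txt

load_dotenv()  # pulls key=value pairs from .env into the process environment
token = os.getenv("DISCORD_TOKEN")
if not token:
    raise ValueError("DISCORD_TOKEN environment variable is required")
```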
182 cli.py Normal file
@@ -0,0 +1,182 @@
#!/usr/bin/env python3
"""
Command-line interface for Discord Data Collector.
"""

import argparse
import asyncio
import json
import sys
from pathlib import Path

# Add src to path
sys.path.append(str(Path(__file__).parent))

from src.config import Config
from src.database import JSONDatabase
from src.client import DiscordDataClient


async def export_data(format_type: str, output_path: str = None):
    """Export collected data."""
    config = Config()
    database = JSONDatabase(config.database_path)

    if output_path is None:
        from datetime import datetime
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = f"data/export_{timestamp}.{format_type}"

    if format_type == "csv":
        await database.export_to_csv(output_path)
        print(f"Data exported to {output_path}")
    else:
        print(f"Unsupported format: {format_type}")


async def show_stats():
    """Show database statistics."""
    config = Config()
    database = JSONDatabase(config.database_path)

    stats = await database.get_statistics()

    print("\n=== Database Statistics ===")
    print(f"Total users: {stats['total_users']}")
    print(f"Total servers: {stats['total_servers']}")
    print(f"Database size: {stats['database_size']} bytes")

    if stats['most_active_servers']:
        print("\nMost active servers:")
        for server_id, user_count in stats['most_active_servers'][:5]:
            print(f"  Server {server_id}: {user_count} users")


async def search_user(query: str):
    """Search for users."""
    config = Config()
    database = JSONDatabase(config.database_path)

    all_users = await database.get_all_users()

    # Search by username or user ID
    results = []
    for user in all_users:
        if (query.lower() in user.username.lower() or
                query.lower() in (user.display_name or "").lower() or
                query == str(user.user_id)):
            results.append(user)

    if not results:
        print("No users found matching the query.")
        return

    print(f"\n=== Found {len(results)} users ===")
    for user in results[:10]:  # Show first 10 results
        print(f"{user.username}#{user.discriminator} (ID: {user.user_id})")
        if user.display_name:
            print(f"  Display name: {user.display_name}")
        if user.bio:
            print(f"  Bio: {user.bio[:100]}...")
        print(f"  Servers: {len(user.servers)}")
        print(f"  Last updated: {user.updated_at}")
        print()


async def backup_database():
    """Create a manual backup of the database."""
    config = Config()
    database = JSONDatabase(config.database_path)

    from datetime import datetime
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    backup_path = f"data/backups/manual_backup_{timestamp}.json"

    # Copy current database
    import shutil
    shutil.copy2(database.database_path, backup_path)

    print(f"Database backed up to {backup_path}")


async def cleanup_data():
    """Clean up old data and backups."""
    config = Config()
    database = JSONDatabase(config.database_path)

    await database.cleanup_old_backups(max_backups=5)
    print("Cleanup completed")


async def test_connection():
    """Test Discord connection."""
    try:
        config = Config()
        database = JSONDatabase(config.database_path)
        client = DiscordDataClient(config, database)

        print("Testing Discord connection...")

        # This will test the connection without starting the full bot
        await client.login(config.discord_token)
        user_info = client.user

        print(f"✓ Successfully connected as {user_info.name}#{user_info.discriminator}")
        print(f"✓ User ID: {user_info.id}")

        await client.close()

    except Exception as e:
        print(f"✗ Connection failed: {e}")
        sys.exit(1)


def main():
    """Main CLI entry point."""
    parser = argparse.ArgumentParser(description="Discord Data Collector CLI")
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Export command
    export_parser = subparsers.add_parser("export", help="Export collected data")
    export_parser.add_argument("format", choices=["csv"], help="Export format")
    export_parser.add_argument("-o", "--output", help="Output file path")

    # Stats command
    subparsers.add_parser("stats", help="Show database statistics")

    # Search command
    search_parser = subparsers.add_parser("search", help="Search for users")
    search_parser.add_argument("query", help="Search query (username or user ID)")

    # Backup command
    subparsers.add_parser("backup", help="Create manual database backup")

    # Cleanup command
    subparsers.add_parser("cleanup", help="Clean up old data and backups")

    # Test command
    subparsers.add_parser("test", help="Test Discord connection")

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return

    # Run the appropriate command
    if args.command == "export":
        asyncio.run(export_data(args.format, args.output))
    elif args.command == "stats":
        asyncio.run(show_stats())
    elif args.command == "search":
        asyncio.run(search_user(args.query))
    elif args.command == "backup":
        asyncio.run(backup_database())
    elif args.command == "cleanup":
        asyncio.run(cleanup_data())
    elif args.command == "test":
        asyncio.run(test_connection())


if __name__ == "__main__":
    main()
33 config.toml Normal file
@@ -0,0 +1,33 @@
# Discord Data Collector Configuration

[database]
# JSON database file path
path = "data/users.json"
# Backup interval in seconds (3600 = 1 hour)
backup_interval = 3600

[collection]
# What data to collect
profile_pictures = true
bio = true
status = true
server_membership = true

[rate_limiting]
# Delay between API requests in seconds
request_delay = 1.0
# Maximum requests per minute
max_requests_per_minute = 30

[monitoring]
# List of specific server IDs to monitor (leave empty to monitor all)
# Example: target_servers = [123456789, 987654321]
target_servers = []
# Monitor all servers the account is in
monitor_all_servers = true

[logging]
# Log level: DEBUG, INFO, WARNING, ERROR, CRITICAL
level = "INFO"
# Log file path
file = "logs/collector.log"
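
A quick sketch of reading this file with the `toml` package from requirements.txt; the `toml.load` call and the `.get(..., default)` fallbacks match what `src/config.py` does:

```python
import toml

with open("config.toml", "r") as f:
    config_data = toml.load(f)

# Missing sections fall back to the same defaults Config uses.
rate_limits = config_data.get("rate_limiting", {})
print(rate_limits.get("request_delay", 1.0))
print(rate_limits.get("max_requests_per_minute", 30))
```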
52 main.py Normal file
@@ -0,0 +1,52 @@
#!/usr/bin/env python3
"""
Discord User Data Collector
Main application entry point for collecting Discord user data for research purposes.
"""

import asyncio
import logging
import sys
from pathlib import Path

from src.client import DiscordDataClient
from src.config import Config
from src.database import JSONDatabase
from src.logger import setup_logger


async def main():
    """Main application entry point."""
    client = None
    logger = logging.getLogger(__name__)  # fallback until setup_logger() runs

    try:
        # Setup configuration
        config = Config()

        # Setup logging
        logger = setup_logger(config.log_level, config.log_file)
        logger.info("Starting Discord Data Collector")

        # Initialize database
        database = JSONDatabase(config.database_path)

        # Initialize Discord client
        client = DiscordDataClient(config, database)

        # Start the client
        logger.info("Starting Discord client...")
        await client.start(config.discord_token)

    except KeyboardInterrupt:
        logger.info("Received keyboard interrupt, shutting down...")
    except Exception as e:
        logger.error(f"Fatal error: {e}", exc_info=True)
        sys.exit(1)
    finally:
        if client is not None:
            await client.close()
        logger.info("Application shutdown complete")


if __name__ == "__main__":
    asyncio.run(main())
208 readme.md Normal file
@@ -0,0 +1,208 @@
# ⚠️ Warning! AI slop.


# Discord Data Collector

A Python application for collecting Discord user data for research purposes, specifically designed to study information propagation patterns in Discord communities.

## Important Disclaimers

- **Terms of Service**: This application uses self-botting, which violates Discord's Terms of Service and may result in account suspension.
- **Educational Use Only**: This tool is intended solely for educational and research purposes.
- **Privacy Considerations**: Always respect user privacy and obtain proper consent when collecting data.
- **Legal Compliance**: Ensure compliance with applicable data protection laws (GDPR, CCPA, etc.).

## Features

- **User Data Collection**: Automatically collects usernames, profile pictures, bios, statuses, and server memberships
- **Message Monitoring**: Processes messages from monitored servers to identify active users
- **Rate Limiting**: Built-in rate limiting to avoid hitting Discord API limits
- **Flexible Configuration**: Easy configuration via TOML and environment files
- **Data Export**: Export collected data to CSV format
- **Database Management**: JSON-based storage with automatic backups
- **CLI Tools**: Command-line interface for data management and analysis

## Installation

1. **Clone the repository**:
   ```bash
   git clone <repository-url>
   cd discord-data-collector
   ```

2. **Install dependencies**:
   ```bash
   pip install -r requirements.txt
   ```

3. **Create configuration files**:
   ```bash
   cp .env.example .env
   # Edit .env with your Discord token
   ```

4. **Configure settings**:
   - Edit `config.toml` to adjust collection settings
   - Add your Discord user token to `.env`

## Configuration

### Environment Variables (.env)

```env
# Your Discord user token (REQUIRED)
DISCORD_TOKEN=your_discord_user_token_here
```

### Configuration File (config.toml)

```toml
[database]
path = "data/users.json"
backup_interval = 3600

[collection]
profile_pictures = true
bio = true
status = true
server_membership = true

[rate_limiting]
request_delay = 1.0
max_requests_per_minute = 30

[monitoring]
target_servers = [] # Empty = monitor all servers
monitor_all_servers = true

[logging]
level = "INFO"
file = "logs/collector.log"
```
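
For orientation, a minimal sketch of how these settings surface as attributes; this mirrors `src/config.py`, which layers `config.toml` over built-in defaults and reads `DISCORD_TOKEN` from `.env`:

```python
from src.config import Config

config = Config()  # loads .env and config.toml, writing a default config if missing
print(config.database_path)            # "data/users.json"
print(config.request_delay)            # 1.0
print(config.max_requests_per_minute)  # 30
print(config.monitor_all_servers)      # True
```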

## Usage

### Running the Collector

```bash
# Start the data collector
python main.py
```

### CLI Commands

```bash
# Show database statistics
python cli.py stats

# Search for users
python cli.py search "username"

# Export data to CSV
python cli.py export csv -o exported_data.csv

# Test Discord connection
python cli.py test

# Create manual backup
python cli.py backup

# Clean up old backups
python cli.py cleanup
```

## Project Structure

```
discord-data-collector/
├── main.py              # Main application entry point
├── cli.py               # Command-line interface
├── config.toml          # Configuration file
├── .env                 # Environment variables
├── requirements.txt     # Python dependencies
├── src/
│   ├── __init__.py
│   ├── client.py        # Discord client implementation
│   ├── config.py        # Configuration management
│   ├── database.py      # JSON database manager
│   ├── rate_limiter.py  # Rate limiting utilities
│   └── logger.py        # Logging setup
├── data/
│   ├── users.json       # User database
│   └── backups/         # Database backups
└── logs/
    └── collector.log    # Application logs
```

## Data Structure

Each user entry contains:

```json
{
  "user_id": 123456789,
  "username": "example_user",
  "discriminator": "1234",
  "display_name": "Example User",
  "avatar_url": "https://cdn.discordapp.com/avatars/...",
  "banner_url": "https://cdn.discordapp.com/banners/...",
  "bio": "User's about me section",
  "status": "online",
  "activity": "Playing a game",
  "servers": [111111111, 222222222],
  "created_at": "2024-01-01T00:00:00",
  "updated_at": "2024-01-01T12:00:00"
}
```
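
The same record maps onto the `UserData` dataclass in `src/database.py`; a short construction sketch:

```python
from src.database import UserData

# Timestamps and the servers list are filled in by __post_init__ when omitted.
user = UserData(user_id=123456789, username="example_user", discriminator="1234")
user.servers.append(111111111)
print(user.created_at)  # ISO 8601 UTC timestamp
```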

## Features in Detail

### Rate Limiting
- Configurable request delays
- Per-minute request limits
- Automatic backoff on rate limit hits
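
A minimal usage sketch of the `RateLimiter` from `src/rate_limiter.py`; the limits shown echo the defaults in `config.toml`:

```python
import asyncio

from src.rate_limiter import RateLimiter

async def fetch_profiles(user_ids):
    limiter = RateLimiter(requests_per_minute=30, delay_between_requests=1.0)
    for user_id in user_ids:
        await limiter.wait()   # sleeps as needed before each request
        # ... perform the actual API call for user_id here ...
    print(limiter.get_stats())  # requests_last_minute, time_since_last_request, ...

asyncio.run(fetch_profiles(range(5)))
```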

### Data Collection
- Real-time message monitoring
- Member list scanning
- Profile updates tracking
- Server membership tracking
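
The collection loop itself lives in `DiscordDataClient` (`src/client.py`). As an illustration only — the `on_message` hook is the standard discord.py event, but `record_user` here is a hypothetical helper, not a function from this repository:

```python
import discord  # provided by discord.py-self (see requirements.txt)

class SketchClient(discord.Client):
    """Illustrative only: the message-monitoring pattern, not the real client."""

    async def on_message(self, message):
        # Every message identifies an active user worth recording.
        await self.record_user(message.author)

    async def record_user(self, user):  # hypothetical helper
        print(f"seen user {user.id}")
```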

### Database Management
- Automatic backups
- Data deduplication
- Export capabilities
- Statistics generation
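
These operations are all methods on `JSONDatabase` in `src/database.py`; a short sketch:

```python
import asyncio

from src.database import JSONDatabase

async def report():
    database = JSONDatabase("data/users.json")
    stats = await database.get_statistics()
    print(stats["total_users"], stats["total_servers"])
    await database.export_to_csv("data/export.csv")
    await database.cleanup_old_backups(max_backups=5)

asyncio.run(report())
```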

### Logging
- Configurable log levels
- File rotation
- Separate discord.py logging
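
Logging is configured through `setup_logger` in `src/logger.py`:

```python
from src.logger import setup_logger

# Rotates the log file at 10 MB (5 backups); discord.py's own loggers are capped at WARNING.
logger = setup_logger(log_level="DEBUG", log_file="logs/collector.log")
logger.info("logging configured")
```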

## Future Enhancements

- MongoDB integration for better scalability
- Web dashboard for data visualization
- Advanced search and filtering
- Data analysis tools
- Network analysis features

## Contributing

1. Fork the repository
2. Create a feature branch
3. Make your changes
4. Add tests if applicable
5. Submit a pull request

## License

This project is for educational purposes only. Use responsibly and in compliance with applicable laws and terms of service.

## Support

For issues or questions, please create an issue in the repository.

---

**Remember**: This tool is for educational research only. Always respect user privacy and platform terms of service.
20 requirements.txt Normal file
@@ -0,0 +1,20 @@
# Discord Data Collector Requirements

# Discord self-bot library
discord.py-self>=2.0.0

# Configuration management
python-dotenv>=1.0.0
toml>=0.10.2

# Database (for future MongoDB integration)
pymongo>=4.0.0

# Async utilities
asyncio-throttle>=1.0.0

# Data processing
pandas>=1.5.0

# Logging
colorlog>=6.0.0
43 src/__init__.py Normal file
@@ -0,0 +1,43 @@
"""
Discord Data Collector - A tool for collecting Discord user data for research purposes.

This package provides functionality to collect user data from Discord servers
for academic research, particularly focused on studying information propagation
and community dynamics.

Components:
- client: Discord client implementation
- config: Configuration management
- database: Data storage and management
- rate_limiter: API rate limiting
- logger: Logging utilities

Usage:
    from src.client import DiscordDataClient
    from src.config import Config
    from src.database import JSONDatabase

    config = Config()
    database = JSONDatabase(config.database_path)
    client = DiscordDataClient(config, database)
"""

__version__ = "1.0.0"
__author__ = "Research Team"
__description__ = "Discord Data Collector for Research Purposes"

# Import main classes for easier access
from .client import DiscordDataClient
from .config import Config
from .database import JSONDatabase, UserData
from .rate_limiter import RateLimiter
from .logger import setup_logger

__all__ = [
    'DiscordDataClient',
    'Config',
    'JSONDatabase',
    'UserData',
    'RateLimiter',
    'setup_logger'
]
205 src/client.py Normal file
121 src/config.py Normal file
@@ -0,0 +1,121 @@
"""
Configuration management for Discord Data Collector.
"""

import os
import toml
from pathlib import Path
from typing import List, Optional
from dotenv import load_dotenv


class Config:
    """Configuration manager for the Discord Data Collector."""

    def __init__(self, config_file: str = "config.toml"):
        """Initialize configuration from TOML file and environment variables."""

        # Load environment variables from .env file
        load_dotenv()

        # Load TOML configuration
        self.config_file = Path(config_file)
        self.config_data = self._load_config()

        # Discord settings
        self.discord_token = os.getenv("DISCORD_TOKEN")
        if not self.discord_token:
            raise ValueError("DISCORD_TOKEN environment variable is required")

        # Database settings
        self.database_path = self.config_data.get("database", {}).get("path", "data/users.json")
        self.backup_interval = self.config_data.get("database", {}).get("backup_interval", 3600)

        # Collection settings
        collection_config = self.config_data.get("collection", {})
        self.collect_profile_pics = collection_config.get("profile_pictures", True)
        self.collect_bio = collection_config.get("bio", True)
        self.collect_status = collection_config.get("status", True)
        self.collect_server_membership = collection_config.get("server_membership", True)

        # Rate limiting settings
        rate_limit_config = self.config_data.get("rate_limiting", {})
        self.request_delay = rate_limit_config.get("request_delay", 1.0)
        self.max_requests_per_minute = rate_limit_config.get("max_requests_per_minute", 30)

        # Monitoring settings
        monitoring_config = self.config_data.get("monitoring", {})
        self.target_servers = monitoring_config.get("target_servers", [])
        self.monitor_all_servers = monitoring_config.get("monitor_all_servers", True)

        # Logging settings
        logging_config = self.config_data.get("logging", {})
        self.log_level = logging_config.get("level", "INFO")
        self.log_file = logging_config.get("file", "logs/collector.log")

        # Ensure directories exist
        self._ensure_directories()

    def _load_config(self) -> dict:
        """Load configuration from TOML file."""
        if not self.config_file.exists():
            self._create_default_config()

        try:
            with open(self.config_file, 'r') as f:
                return toml.load(f)
        except Exception as e:
            print(f"Error loading config file: {e}")
            return {}

    def _create_default_config(self):
        """Create a default configuration file."""
        default_config = {
            "database": {
                "path": "data/users.json",
                "backup_interval": 3600
            },
            "collection": {
                "profile_pictures": True,
                "bio": True,
                "status": True,
                "server_membership": True
            },
            "rate_limiting": {
                "request_delay": 1.0,
                "max_requests_per_minute": 30
            },
            "monitoring": {
                "target_servers": [],
                "monitor_all_servers": True
            },
            "logging": {
                "level": "INFO",
                "file": "logs/collector.log"
            }
        }

        # Create directory if it doesn't exist
        self.config_file.parent.mkdir(parents=True, exist_ok=True)

        with open(self.config_file, 'w') as f:
            toml.dump(default_config, f)

        print(f"Created default configuration file: {self.config_file}")

    def _ensure_directories(self):
        """Ensure required directories exist."""
        directories = [
            Path(self.database_path).parent,
            Path(self.log_file).parent,
            Path("data/backups")
        ]

        for directory in directories:
            directory.mkdir(parents=True, exist_ok=True)

    def get_target_servers(self) -> List[int]:
        """Get list of target server IDs."""
        if self.monitor_all_servers:
            return []
        return [int(server_id) for server_id in self.target_servers]
205 src/database.py Normal file
@@ -0,0 +1,205 @@
"""
JSON database manager for Discord user data storage.
"""

import json
import asyncio
import shutil
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, asdict
import logging


@dataclass
class UserData:
    """Data structure for storing user information."""
    user_id: int
    username: str
    discriminator: str
    display_name: Optional[str] = None
    avatar_url: Optional[str] = None
    banner_url: Optional[str] = None
    bio: Optional[str] = None
    status: Optional[str] = None
    activity: Optional[str] = None
    servers: Optional[List[int]] = None
    created_at: Optional[str] = None
    updated_at: Optional[str] = None

    def __post_init__(self):
        if self.servers is None:
            self.servers = []

        current_time = datetime.utcnow().isoformat()
        if self.created_at is None:
            self.created_at = current_time
        self.updated_at = current_time


class JSONDatabase:
    """JSON-based database for storing Discord user data."""

    def __init__(self, database_path: str):
        """Initialize the JSON database."""
        self.database_path = Path(database_path)
        self.backup_path = Path("data/backups")
        self.logger = logging.getLogger(__name__)
        self._lock = asyncio.Lock()
        self._data: Dict[str, Dict] = {}

        # Ensure database directory exists
        self.database_path.parent.mkdir(parents=True, exist_ok=True)
        self.backup_path.mkdir(parents=True, exist_ok=True)

        # Load existing data
        self._load_data()

    def _load_data(self):
        """Load data from JSON file."""
        if self.database_path.exists():
            try:
                with open(self.database_path, 'r', encoding='utf-8') as f:
                    self._data = json.load(f)
                self.logger.info(f"Loaded {len(self._data)} users from database")
            except Exception as e:
                self.logger.error(f"Error loading database: {e}")
                self._data = {}
        else:
            self._data = {}
            self.logger.info("Created new database")

    async def _save_data(self):
        """Save data to JSON file."""
        async with self._lock:
            try:
                # Create backup before saving
                if self.database_path.exists():
                    backup_filename = f"users_backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
                    backup_path = self.backup_path / backup_filename
                    shutil.copy2(self.database_path, backup_path)

                # Save data
                with open(self.database_path, 'w', encoding='utf-8') as f:
                    json.dump(self._data, f, indent=2, ensure_ascii=False)

                self.logger.debug(f"Saved {len(self._data)} users to database")

            except Exception as e:
                self.logger.error(f"Error saving database: {e}")

    async def get_user(self, user_id: int) -> Optional[UserData]:
        """Get user data by ID."""
        user_key = str(user_id)
        if user_key in self._data:
            user_dict = self._data[user_key]
            return UserData(**user_dict)
        return None

    async def save_user(self, user_data: UserData):
        """Save or update user data."""
        user_key = str(user_data.user_id)

        # If user exists, preserve created_at timestamp
        if user_key in self._data:
            user_data.created_at = self._data[user_key]['created_at']

        # Update timestamp
        user_data.updated_at = datetime.utcnow().isoformat()

        # Save to memory
        self._data[user_key] = asdict(user_data)

        # Save to disk
        await self._save_data()

        self.logger.debug(f"Saved user {user_data.username}#{user_data.discriminator} ({user_data.user_id})")

    async def add_server_to_user(self, user_id: int, server_id: int):
        """Add a server to user's server list."""
        user_key = str(user_id)
        if user_key in self._data:
            if server_id not in self._data[user_key]['servers']:
                self._data[user_key]['servers'].append(server_id)
                self._data[user_key]['updated_at'] = datetime.utcnow().isoformat()
                await self._save_data()

    async def get_all_users(self) -> List[UserData]:
        """Get all users from the database."""
        return [UserData(**user_dict) for user_dict in self._data.values()]

    async def get_users_by_server(self, server_id: int) -> List[UserData]:
        """Get all users that are members of a specific server."""
        users = []
        for user_dict in self._data.values():
            if server_id in user_dict.get('servers', []):
                users.append(UserData(**user_dict))
        return users

    async def get_user_count(self) -> int:
        """Get total number of users in database."""
        return len(self._data)

    async def get_server_count(self) -> int:
        """Get total number of unique servers."""
        servers = set()
        for user_dict in self._data.values():
            servers.update(user_dict.get('servers', []))
        return len(servers)

    async def cleanup_old_backups(self, max_backups: int = 10):
        """Clean up old backup files, keeping only the most recent ones."""
        backup_files = sorted(self.backup_path.glob("users_backup_*.json"))

        if len(backup_files) > max_backups:
            files_to_remove = backup_files[:-max_backups]
            for file_path in files_to_remove:
                try:
                    file_path.unlink()
                    self.logger.info(f"Removed old backup: {file_path.name}")
                except Exception as e:
                    self.logger.error(f"Error removing backup {file_path.name}: {e}")

    async def export_to_csv(self, output_path: str):
        """Export user data to CSV format."""
        import csv

        output_path = Path(output_path)

        try:
            with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
                fieldnames = ['user_id', 'username', 'discriminator', 'display_name',
                              'avatar_url', 'banner_url', 'bio', 'status', 'activity',
                              'servers', 'created_at', 'updated_at']
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

                writer.writeheader()
                for user_dict in self._data.values():
                    # Convert servers list to string
                    user_dict_copy = user_dict.copy()
                    user_dict_copy['servers'] = ','.join(map(str, user_dict.get('servers', [])))
                    writer.writerow(user_dict_copy)

            self.logger.info(f"Exported {len(self._data)} users to {output_path}")

        except Exception as e:
            self.logger.error(f"Error exporting to CSV: {e}")

    async def get_statistics(self) -> Dict[str, Any]:
        """Get database statistics."""
        stats = {
            'total_users': await self.get_user_count(),
            'total_servers': await self.get_server_count(),
            'database_size': self.database_path.stat().st_size if self.database_path.exists() else 0
        }

        # Most active servers
        server_counts = {}
        for user_dict in self._data.values():
            for server_id in user_dict.get('servers', []):
                server_counts[server_id] = server_counts.get(server_id, 0) + 1

        stats['most_active_servers'] = sorted(server_counts.items(),
                                              key=lambda x: x[1], reverse=True)[:10]

        return stats
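
A brief round-trip sketch of this API; `asyncio.run` drives the coroutines the same way `cli.py` does:

```python
import asyncio

from src.database import JSONDatabase, UserData

async def demo():
    database = JSONDatabase("data/users.json")
    await database.save_user(UserData(user_id=1, username="alice", discriminator="0001"))
    await database.add_server_to_user(1, 111111111)
    user = await database.get_user(1)
    print(user.username, user.servers)  # alice [111111111]

asyncio.run(demo())
```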
61 src/logger.py Normal file
@@ -0,0 +1,61 @@
"""
Logging setup for Discord Data Collector.
"""

import logging
import sys
from pathlib import Path
from logging.handlers import RotatingFileHandler


def setup_logger(log_level: str = "INFO", log_file: str = "logs/collector.log") -> logging.Logger:
    """
    Set up logging configuration.

    Args:
        log_level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
        log_file: Path to log file

    Returns:
        Configured logger instance
    """

    # Create logs directory if it doesn't exist
    log_path = Path(log_file)
    log_path.parent.mkdir(parents=True, exist_ok=True)

    # Create formatter
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )

    # Setup root logger
    root_logger = logging.getLogger()
    root_logger.setLevel(getattr(logging, log_level.upper()))

    # Clear existing handlers
    root_logger.handlers.clear()

    # Console handler
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(getattr(logging, log_level.upper()))
    console_handler.setFormatter(formatter)
    root_logger.addHandler(console_handler)

    # File handler with rotation
    file_handler = RotatingFileHandler(
        log_file,
        maxBytes=10*1024*1024,  # 10MB
        backupCount=5
    )
    file_handler.setLevel(getattr(logging, log_level.upper()))
    file_handler.setFormatter(formatter)
    root_logger.addHandler(file_handler)

    # Set specific logger levels for discord.py to reduce noise
    logging.getLogger('discord').setLevel(logging.WARNING)
    logging.getLogger('discord.http').setLevel(logging.WARNING)
    logging.getLogger('discord.gateway').setLevel(logging.WARNING)

    return root_logger
70 src/rate_limiter.py Normal file
@@ -0,0 +1,70 @@
"""
Rate limiter for Discord API requests.
"""

import asyncio
import time
from collections import deque
from typing import Optional
import logging


class RateLimiter:
    """Rate limiter to prevent hitting Discord API limits."""

    def __init__(self, requests_per_minute: int = 30, delay_between_requests: float = 1.0):
        """
        Initialize rate limiter.

        Args:
            requests_per_minute: Maximum requests per minute
            delay_between_requests: Minimum delay between requests in seconds
        """
        self.requests_per_minute = requests_per_minute
        self.delay_between_requests = delay_between_requests
        self.request_times = deque()
        self.last_request_time = 0
        self.logger = logging.getLogger(__name__)

    async def wait(self):
        """Wait if necessary to respect rate limits."""
        current_time = time.time()

        # Remove old requests (older than 1 minute)
        while self.request_times and current_time - self.request_times[0] > 60:
            self.request_times.popleft()

        # Check if we're at the rate limit
        if len(self.request_times) >= self.requests_per_minute:
            # Wait until the oldest request is more than 1 minute old
            wait_time = 60 - (current_time - self.request_times[0])
            if wait_time > 0:
                self.logger.debug(f"Rate limit reached, waiting {wait_time:.2f} seconds")
                await asyncio.sleep(wait_time)
                current_time = time.time()

        # Check minimum delay between requests
        time_since_last = current_time - self.last_request_time
        if time_since_last < self.delay_between_requests:
            wait_time = self.delay_between_requests - time_since_last
            await asyncio.sleep(wait_time)
            current_time = time.time()

        # Record this request
        self.request_times.append(current_time)
        self.last_request_time = current_time

    def get_stats(self) -> dict:
        """Get rate limiter statistics."""
        current_time = time.time()

        # Clean old requests
        while self.request_times and current_time - self.request_times[0] > 60:
            self.request_times.popleft()

        return {
            'requests_last_minute': len(self.request_times),
            'requests_per_minute_limit': self.requests_per_minute,
            'delay_between_requests': self.delay_between_requests,
            'time_since_last_request': current_time - self.last_request_time
        }