minecraft-crawler/website/collect-mod-descriptions.py
hiina b8ecd6aa37 make basic website scraper thing
Even if modrinth approves this modpack, their description page for
it isn't that good anyway, for figuring out what's all in there.
2024-08-14 19:37:54 -06:00

245 lines
9.1 KiB
Python

#!/usr/bin/env python
"""
From the packwiz toml files, look up the mod descriptions from the modrinth API
and collect them into an html page using yattag.
The toml files look like:
```
name = "almostunified-fabric-1.20.1-0.9.4"
filename = "almostunified-fabric-1.20.1-0.9.4.jar"
side = "both"
[download]
url = "https://cdn.modrinth.com/data/sdaSaQEz/versions/iVBf0ICr/almostunified-fabric-1.20.1-0.9.4.jar"
hash = "ec47335d9d8b98c107a2b4cb4bada845669728f78c65df2ef2ee5e06d9ac866d276d09892896c216e30eb028a6fdd0a6cc92a8741eee1c14fa3d0ca24444cbdb"
hash-format = "sha512"
mode = "url"
[option]
optional = false
default = false
[update.modrinth]
mod-id = "sdaSaQEz"
version = "iVBf0ICr"
```
So the update.modrinth.mod-id is the one to look up.
"""
import os
import toml
import requests
from yattag import Doc, indent
import requests_cache
import logging
from tqdm import tqdm
import json
import os
# Configure root logging at INFO, so the logger.debug() calls in this module
# are suppressed unless the level is lowered.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Shared HTTP session for Modrinth API requests; requests_cache stores
# responses under the cache name 'collectmoddescriptions'.
# NOTE(review): cache_control=True presumably makes cache expiry follow the
# server's Cache-Control response headers -- confirm against requests_cache docs.
session = requests_cache.CachedSession('collectmoddescriptions', cache_control=True)
def collect_mod_info(directory):
    """Fetch Modrinth project metadata for every packwiz mod in *directory*.

    Every ``*.toml`` file in the directory is parsed. When it contains an
    ``[update.modrinth]`` table with a ``mod-id``, that project is requested
    from the Modrinth v2 API through the module-level ``session``.

    Args:
        directory: Path of the folder holding the packwiz ``.toml`` files.

    Returns:
        A dict mapping each Modrinth project id to the decoded JSON body of
        ``/v2/project/{id}``. Files are visited in sorted name order so the
        resulting dict (and anything rendered from it) is deterministic.

    Raises:
        Exception: If the API answers with a status other than HTTP 200.
    """
    mods = {}
    # os.listdir() order is arbitrary; sort it so repeated runs agree.
    files = sorted(os.listdir(directory))
    for filename in tqdm(files, desc="Processing mod files", unit="file"):
        if not filename.endswith('.toml'):
            continue
        file_path = os.path.join(directory, filename)
        logger.debug("Processing file: %s", file_path)
        # TOML documents are UTF-8 by specification; do not rely on the
        # platform's locale encoding (e.g. cp1252 on Windows).
        with open(file_path, 'r', encoding='utf-8') as file:
            data = toml.load(file)
        if 'update' not in data or 'modrinth' not in data['update']:
            continue
        mod_id = data['update']['modrinth'].get('mod-id')
        if not mod_id:
            continue
        url = f"https://api.modrinth.com/v2/project/{mod_id}"
        response = session.get(url)
        if response.status_code != 200:
            raise Exception(
                f"Failed to fetch data for mod ID: {mod_id} "
                f"(HTTP {response.status_code})"
            )
        mods[mod_id] = response.json()
    return mods
"""
Calculate the dependency tree of the flat list of mod version ids, such that mods that aren't
depended on by other mods are at the top level, and other mods are nested
under the top-level mods that depend on them (possibly multiple times).
In the Modrinth schema, a project has versions, which have dependencies on other
projects' versions.
"""
def get_mod_dependencies(version_id):
    """Return the project ids that a Modrinth version lists as required.

    The version is requested from ``/v2/version/{version_id}`` through the
    module-level cached ``session``, the same one used for project lookups.

    Args:
        version_id: Modrinth version id, as found under ``[update.modrinth]``
            in a packwiz file.

    Returns:
        A list of project ids whose ``dependency_type`` is ``'required'``.
        Dependencies without a ``project_id`` are skipped, since they cannot
        be linked to a project. On a non-200 response a warning is logged and
        an empty list is returned.
    """
    url = f"https://api.modrinth.com/v2/version/{version_id}"
    response = session.get(url)
    if response.status_code != 200:
        logger.warning(
            "Error fetching version data for %s: %s",
            version_id,
            response.status_code,
        )
        return []
    # Tolerate a missing or null 'dependencies' field.
    dependencies = response.json().get('dependencies') or []
    return [
        dep['project_id']
        for dep in dependencies
        if dep.get('dependency_type') == 'required' and dep.get('project_id')
    ]
def build_dependency_tree(directory):
    """Build (or load from cache) the required-dependency map for a modpack.

    If ``dependency_tree_cache.json`` exists in the current working
    directory, its contents are returned as-is. Otherwise every ``*.toml``
    file in *directory* with both a ``mod-id`` and a ``version`` under
    ``[update.modrinth]`` is resolved via ``get_mod_dependencies`` and the
    result is written to that cache file.

    NOTE(review): the cache file is not keyed by *directory* and is never
    invalidated here, so a stale or unrelated cache is returned unchanged.
    Delete the file to force a rebuild.

    Args:
        directory: Path of the folder holding the packwiz ``.toml`` files.

    Returns:
        A dict mapping project id to
        ``{'name': str, 'version_id': str, 'dependencies': list}``.
    """
    cache_file = 'dependency_tree_cache.json'
    if os.path.exists(cache_file):
        with open(cache_file, 'r', encoding='utf-8') as f:
            return json.load(f)

    dependency_tree = {}
    # Sorted so the tree (and the cache file) is built in a stable order.
    files = sorted(os.listdir(directory))
    for filename in tqdm(files, desc="Building dependency tree", unit="file"):
        if not filename.endswith('.toml'):
            continue
        file_path = os.path.join(directory, filename)
        # TOML is UTF-8 by specification; be explicit rather than relying on
        # the platform default encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = toml.load(file)
        if 'update' not in data or 'modrinth' not in data['update']:
            continue
        modrinth = data['update']['modrinth']
        mod_id = modrinth.get('mod-id')
        version_id = modrinth.get('version')
        if not (mod_id and version_id):
            continue
        dependency_tree[mod_id] = {
            # Fall back to the project id so a file without 'name' does not
            # abort the run after all the network work.
            'name': data.get('name', mod_id),
            'version_id': version_id,
            'dependencies': get_mod_dependencies(version_id),
        }

    with open(cache_file, 'w', encoding='utf-8') as f:
        json.dump(dependency_tree, f)
    return dependency_tree
def get_modrinth_url(slug):
    """Return the public modrinth.com page URL for the project *slug*."""
    return "https://modrinth.com/mod/{}".format(slug)
def render_dependency_tree(tree, mod_id, mod_info, level=0, _ancestors=()):
    """Render *mod_id* and its required dependencies as nested HTML divs.

    Args:
        tree: Dependency map of project id to a dict with ``'name'`` and
            ``'dependencies'`` entries.
        mod_id: Project id to render at this level.
        mod_info: Map of project id to Modrinth project data; the ``'slug'``
            entry is used to build the link.
        level: Nesting depth; each level indents by 20px.
        _ancestors: Internal. Project ids on the path from the root to this
            node, used to stop on dependency cycles.

    Returns:
        The HTML fragment as a string. It is empty when *mod_id* is not in
        *tree* or when it already appears among its own ancestors.
    """
    doc, tag, text = Doc().tagtext()
    mod_data = tree.get(mod_id)
    # A cycle (A requires B requires A) would otherwise recurse forever.
    if not mod_data or mod_id in _ancestors:
        return doc.getvalue()

    path = _ancestors + (mod_id,)
    # The tree may name a mod that mod_info lacks (e.g. a stale tree cache);
    # render it without a link instead of raising KeyError.
    info = mod_info.get(mod_id)
    slug = info.get('slug') if info else None
    with tag('div', style=f"margin-left: {level * 20}px;"):
        with tag('h3'):
            if slug:
                with tag('a', href=get_modrinth_url(slug)):
                    text(mod_data['name'])
            else:
                text(mod_data['name'])
        for dep_id in mod_data['dependencies']:
            if dep_id in tree:
                doc.asis(
                    render_dependency_tree(tree, dep_id, mod_info, level + 1, path)
                )
    return doc.getvalue()
def generate_html(mod_info, dependency_tree):
    """Render the full HTML report: descriptions, categories, dependencies.

    Args:
        mod_info: Map of project id to Modrinth project data. The ``'slug'``,
            ``'title'``, ``'categories'`` and ``'description'`` entries are
            read.
        dependency_tree: Map of project id to a dict whose ``'dependencies'``
            entry lists required project ids.

    Returns:
        The indented HTML document as a string.
    """
    doc, tag, text = Doc().tagtext()
    doc.asis('<!DOCTYPE html>')
    with tag('html'):
        with tag('head'):
            # Declare the encoding so non-ASCII mod titles/descriptions are
            # displayed correctly when the file is opened directly.
            doc.stag('meta', charset='utf-8')
            with tag('title'):
                text('Mod Descriptions and Dependencies')
            doc.stag('link', rel='stylesheet', href='pico.min.css')
            doc.stag('link', rel='stylesheet', href='style.css')
        with tag('main', klass="container"):
            with tag('h1'):
                text('Mod Descriptions and Dependencies')

            with tag('h2'):
                text('Mod Descriptions')
            with tag('div', id='mod-descriptions'):
                for info in mod_info.values():
                    with tag('div', klass='mod-description'):
                        with tag('h3'):
                            with tag('a', href=get_modrinth_url(info['slug'])):
                                text(info['title'])
                        with tag('p'):
                            # Join the names instead of printing the Python
                            # list repr (['a', 'b']).
                            text(f"Category: {', '.join(info['categories'])}")
                        with tag('p'):
                            text(info['description'])

            with tag('h2'):
                text('Mods by Category')
            # category name -> [(title, slug), ...] in first-seen order
            categories = {}
            for info in mod_info.values():
                for category in info['categories']:
                    categories.setdefault(category, []).append(
                        (info['title'], info['slug'])
                    )
            for category, mods in categories.items():
                with tag('h3'):
                    text(category)
                with tag('ul'):
                    for mod_name, slug in mods:
                        with tag('li'):
                            with tag('a', href=get_modrinth_url(slug)):
                                text(mod_name)

            with tag('h2'):
                text('Dependency Tree')
            # Collect every id that something depends on once, rather than
            # rescanning the whole tree for each mod.
            depended_on = {
                dep_id
                for entry in dependency_tree.values()
                for dep_id in entry['dependencies']
            }
            # Only mods nobody depends on start a top-level subtree.
            for mod_id in dependency_tree:
                if mod_id not in depended_on:
                    doc.asis(
                        render_dependency_tree(dependency_tree, mod_id, mod_info)
                    )
    return indent(doc.getvalue())
def generate_dot_graph(mod_info, dependency_tree):
    """Render the mods and their required dependencies as a Graphviz digraph.

    Args:
        mod_info: Map of project id to Modrinth project data; ``'slug'`` and
            ``'title'`` are read to label and link each node.
        dependency_tree: Map of project id to a dict whose ``'dependencies'``
            entry lists required project ids.

    Returns:
        The DOT source as a string (no trailing newline).
    """
    def _escape(value):
        # Inside a DOT double-quoted string a bare '"' ends the string and a
        # trailing backslash would escape the closing quote.
        return str(value).replace('\\', '\\\\').replace('"', '\\"')

    lines = ["digraph ModDependencies {", " node [shape=box];"]
    for mod_id, info in mod_info.items():
        url = get_modrinth_url(info['slug'])
        lines.append(
            f' "{mod_id}" [label="{_escape(info["title"])}", URL="{url}"];'
        )
    for mod_id, deps in dependency_tree.items():
        for dep in deps['dependencies']:
            # A null project id (possible in an older cached tree) would
            # otherwise create a bogus node named "None".
            if dep is None:
                continue
            lines.append(f' "{mod_id}" -> "{dep}";')
    lines.append("}")
    return "\n".join(lines)
def main(directory, output_filename, output_format='html'):
    """Collect mod data for *directory* and write it to *output_filename*.

    Args:
        directory: Folder holding the packwiz ``.toml`` files.
        output_filename: Path of the file to write (UTF-8).
        output_format: One of ``'html'``, ``'json'`` or ``'dot'``.

    Raises:
        ValueError: If *output_format* is not a supported format. This is
            checked before any file is read or any request is made.
    """
    # Fail fast: do not spend time on API requests for a format that cannot
    # be written anyway.
    if output_format not in ('html', 'json', 'dot'):
        raise ValueError("Invalid output format. Use 'html', 'json', or 'dot'.")

    mod_info = collect_mod_info(directory)
    dependency_tree = build_dependency_tree(directory)

    if output_format == 'html':
        output_content = generate_html(mod_info, dependency_tree)
    elif output_format == 'json':
        output_content = json.dumps({
            'mod_info': mod_info,
            'dependency_tree': dependency_tree
        }, indent=2)
    else:  # 'dot'
        output_content = generate_dot_graph(mod_info, dependency_tree)

    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write(output_content)
    print(f"File with mod descriptions and dependency tree has been generated: {output_filename}")
if __name__ == '__main__':
    # Command-line entry point: <directory> <output_filename>; the output
    # format is chosen from the (case-insensitive) filename suffix.
    import sys

    if len(sys.argv[1:]) != 2:
        print("Usage: python script.py <directory> <output_filename>")
        print("output_filename should end with '.html', '.json', or '.dot'")
        sys.exit(1)

    directory, output_filename = sys.argv[1], sys.argv[2]

    # First matching suffix wins, checked in the order html, json, dot.
    output_format = next(
        (
            fmt
            for fmt in ('html', 'json', 'dot')
            if output_filename.lower().endswith('.' + fmt)
        ),
        None,
    )
    if output_format is None:
        print("Error: output_filename must end with '.html', '.json', or '.dot'")
        sys.exit(1)

    main(directory, output_filename, output_format)