Files
su2026rwep/_extensions/drwater/quartocn/citation-tools.py
T
2026-05-21 13:37:53 +08:00

255 lines
8.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Citation Tools for Academic Writing
This script provides utilities for managing citations in academic writing:
1. Extract citation keys from Markdown files and create a filtered bibliography
2. Copy cited reference files to a specified directory for backup or sharing
Typical usage:
python citation-tools.py --extract
python citation-tools.py --copy
Copyright: © 2025–Present Tom Ben
License: MIT License
"""
import os
import re
import shutil
import argparse
import json
from pathlib import Path
def extract_citation_keys(markdown_file):
    """Extract citation keys from a Markdown file.

    Recognises Pandoc-style citations in both bracketed form (``[@key]``,
    ``[@key1; @key2]``, ``[-@key]``) and in-text form (``@key``).  Keys are
    matched following Pandoc's citation-key syntax: they start with an ASCII
    letter, digit or underscore and may contain letters, digits, underscores
    and *internal* punctuation (``:.#$%&-+?<>~/``).  Punctuation is only
    treated as part of the key when another key character follows it, so
    trailing sentence punctuation is never swallowed.

    An ``@`` that directly follows an ASCII letter or digit is ignored, which
    keeps e-mail addresses such as ``user@example.com`` out of the result.

    Args:
        markdown_file: Path of the Markdown file to scan (read as UTF-8).

    Returns:
        A set of citation key strings, with Quarto cross-reference targets
        (``fig-...``, ``tbl-...``, ``sec-...``, ``eq-...`` and so on)
        filtered out.
    """
    with open(markdown_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # One pattern covers both "[@key" and a bare "@key": "[", ";", "-" and
    # whitespace are all non-alphanumeric, so the look-behind accepts them.
    key_pattern = re.compile(
        r'(?<![A-Za-z0-9])@'
        r'([A-Za-z0-9_](?:[A-Za-z0-9_]|[:.#$%&\-+?<>~/](?=[A-Za-z0-9_]))*)'
    )

    # Prefixes Quarto reserves for cross-references; "@sec-intro" and friends
    # are links inside the document, not bibliography entries.
    crossref_prefixes = (
        'fig-', 'tbl-', 'lst-', 'sec-', 'eq-', 'alg-',
        'thm-', 'lem-', 'cor-', 'prp-', 'cnj-', 'def-',
        'exm-', 'exr-', 'sol-', 'rem-',
        'tip-', 'nte-', 'wrn-', 'imp-', 'cau-',
    )

    return {
        key for key in key_pattern.findall(content)
        if not key.startswith(crossref_prefixes)
    }
def load_csl_entries(csl_json_file):
    """Read a CSL JSON bibliography file and return its entries.

    Args:
        csl_json_file: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        The decoded top-level JSON list.

    Raises:
        ValueError: If the decoded document is not a list.
    """
    with open(csl_json_file, 'r', encoding='utf-8') as handle:
        data = json.load(handle)

    # A CSL JSON export is expected to be an array at the top level.
    if isinstance(data, list):
        return data
    raise ValueError(
        f"Expected a list of CSL JSON entries, got {type(data)} instead.")
def parse_file_field(file_field):
    """Split a CSL JSON ``file`` field into individual paths.

    Args:
        file_field: The raw field value; paths are separated by ``;``.

    Returns:
        A list of the non-empty, whitespace-stripped segments.  Anything
        that is not a non-empty string yields an empty list.
    """
    if not isinstance(file_field, str):
        return []

    # An empty string splits into [''], which the filter below discards.
    segments = (chunk.strip() for chunk in file_field.split(';'))
    return [segment for segment in segments if segment]
def build_citation_file_index(entries):
    """Map each citation ID to the file paths attached to its entry.

    Args:
        entries: Iterable of CSL JSON entry dictionaries.

    Returns:
        A dict from entry ``id`` to the list of paths parsed from the
        entry's ``file`` field.  Entries without an ``id`` or without any
        attached path are left out; a later entry with the same ``id``
        replaces an earlier one.
    """
    index = {}
    for record in entries:
        cite_id = record.get('id')
        # Entries lacking an id cannot be referenced, so skip parsing them.
        attached = parse_file_field(record.get('file')) if cite_id else []
        if attached:
            index[cite_id] = attached
    return index
def extract_csl_json_entries(csl_json_file, citation_keys, remove_fields=None):
    """Serialise the bibliography entries matching the given citation keys.

    Args:
        csl_json_file: Path of the CSL JSON bibliography to read.
        citation_keys: Iterable of citation keys to keep.
        remove_fields: Field names stripped from every kept entry;
            defaults to ``['file']`` when ``None``.

    Returns:
        A JSON string (2-space indent, non-ASCII characters preserved,
        trailing newline) holding the matching entries sorted by ``id``.
    """
    dropped = ['file'] if remove_fields is None else remove_fields
    entries = load_csl_entries(csl_json_file)
    wanted = set(citation_keys)

    selected = []
    for record in entries:
        record_id = record.get('id')
        if not record_id or record_id not in wanted:
            continue
        selected.append({
            field: value for field, value in record.items()
            if field not in dropped
        })

    # Sort on id so the generated file is stable between runs.
    ordered = sorted(selected, key=lambda item: item.get('id', ''))
    return json.dumps(ordered, ensure_ascii=False, indent=2) + '\n'
def copy_cited_files(args):
    """Copy the files attached to cited references into an output folder.

    Scans every ``[0-9]*.md`` file in ``args.content_dir`` for citation
    keys, looks each key up in the bibliography at ``args.bib`` and copies
    the first attached file that exists on disk to ``args.output_dir``,
    renamed to ``<key><original extension>``.  A summary is printed at the
    end.

    Args:
        args: Parsed command-line namespace providing ``clean``,
            ``output_dir``, ``bib`` and ``content_dir``.

    Returns:
        The set of all citation keys found in the Markdown files.
    """
    out_dir = args.output_dir

    # Optionally wipe a previous export, then make sure the folder exists.
    if args.clean and os.path.exists(out_dir):
        print(f"Cleaning output directory: {out_dir}")
        shutil.rmtree(out_dir)
    os.makedirs(out_dir, exist_ok=True)

    # Citation id -> list of attached file paths.
    citation_files = build_citation_file_index(load_csl_entries(args.bib))

    # Gather the keys cited across all numbered Markdown files.
    markdown_files = list(Path(args.content_dir).glob('[0-9]*.md'))
    all_keys = set().union(
        *(extract_citation_keys(md_file) for md_file in markdown_files))

    copied_count = 0
    missing_keys = []      # cited keys with no attachment in the bibliography
    not_found_pairs = []   # (key, path) where no listed path exists on disk

    for key in all_keys:
        if key not in citation_files:
            missing_keys.append(key)
            continue

        candidates = citation_files[key]
        existing = None
        for candidate in candidates:
            if os.path.exists(candidate):
                existing = candidate
                break

        # Fall back to the first listed path purely for reporting purposes.
        source = existing or candidates[0]
        extension = os.path.splitext(source)[1]
        target = os.path.join(out_dir, f"{key}{extension}")

        try:
            if existing and os.path.exists(existing):
                shutil.copy2(existing, target)
                copied_count += 1
            else:
                not_found_pairs.append((key, source))
        except Exception as e:
            # Best effort: report the failure and carry on with other keys.
            print(f"Error copying {key}: {e}")

    # Print simplified summary
    print(f"Markdown files in content directory: {len(markdown_files)}")
    print(f"Total unique citation keys found: {len(all_keys)}")
    print(f"Files successfully copied: {copied_count}")
    print(f"Citation keys without file paths: {len(missing_keys)}")
    print(
        f"Files not found (path exists in bibliography but file missing): {len(not_found_pairs)}")

    if missing_keys:
        print("\nCitation keys without file paths:")
        for key in sorted(missing_keys):
            print(f" - {key}")

    if not_found_pairs:
        print("\nCitation keys where file wasn't found:")
        for key, path in sorted(not_found_pairs):
            print(f" - {key}: {path}")

    return all_keys
def extract_citations(args):
    """Write a bibliography containing only the references that are cited.

    Scans every ``[0-9]*.md`` file in ``args.content_dir`` for citation
    keys, selects the matching entries from ``args.bib`` (dropping the
    fields named in ``args.remove_fields``) and writes them as CSL JSON to
    ``args.output_bib``.

    Args:
        args: Parsed command-line namespace providing ``content_dir``,
            ``bib``, ``remove_fields`` and ``output_bib``.

    Returns:
        The set of all citation keys found in the Markdown files.
    """
    # Gather the keys cited across all numbered Markdown files.
    markdown_files = list(Path(args.content_dir).glob('[0-9]*.md'))
    all_keys = set().union(
        *(extract_citation_keys(md_file) for md_file in markdown_files))

    # Build the filtered bibliography and store it as UTF-8 text.
    json_content = extract_csl_json_entries(
        args.bib, all_keys, args.remove_fields)
    Path(args.output_bib).write_text(json_content, encoding='utf-8')

    # Print simplified summary
    print(f"Markdown files in content directory: {len(markdown_files)}")
    print(f"Total unique citation keys found: {len(all_keys)}")
    print(f"Extracted citations to `{args.output_bib}`")
    return all_keys
def main():
    """Parse command line arguments and execute the appropriate function.

    ``--extract`` writes a filtered CSL JSON bibliography and ``--copy``
    copies the cited files.  When both flags are given only ``--extract``
    runs; when neither is given the help text is printed.

    The project root used for the default paths is the directory that
    contains the nearest ``_extensions`` ancestor of this script, so the
    defaults stay correct whether the script lives directly in
    ``_extensions/`` or deeper (for example ``_extensions/<org>/<name>/``).
    If the script is not inside an ``_extensions`` folder, its own
    directory is used.
    """
    # Get script location and project root
    script_dir = Path(__file__).parent.resolve()
    project_root = script_dir
    # Checking only script_dir.name misses nested extension layouts, so
    # walk upwards until an "_extensions" directory is found.
    for candidate in (script_dir, *script_dir.parents):
        if candidate.name == "_extensions":
            project_root = candidate.parent
            break

    parser = argparse.ArgumentParser(
        description='Citation tools for extracting and copying cited references')

    # Common arguments
    default_bib = os.path.expanduser(
        "~/Library/CloudStorage/Dropbox/pkm/bibliography.json")
    default_content_dir = str(project_root / "contents")

    # Add command flags instead of subcommands
    parser.add_argument('--extract', action='store_true',
                        help='Extract citations to a filtered CSL JSON file')
    parser.add_argument('--copy', action='store_true',
                        help='Copy cited files to a directory')

    # Common arguments for both commands
    parser.add_argument('--bib',
                        default=default_bib,
                        help=f'Path to bibliography.json file (default: {default_bib})')
    parser.add_argument('--content_dir',
                        default=default_content_dir,
                        help=f'Path to content directory with Markdown files (default: {default_content_dir})')

    # Arguments specific to extract
    parser.add_argument('--output_bib',
                        default=str(project_root / "citebib.json"),
                        help=f'Path to output CSL JSON file (default: {project_root}/citebib.json)')
    parser.add_argument('--remove_fields',
                        nargs='+',
                        default=['file'],
                        help='Fields to remove from CSL JSON entries (default: file)')

    # Arguments specific to copy
    parser.add_argument('--output_dir',
                        default=os.path.expanduser(
                            "~/Downloads/cited-docs"),
                        help='Path to output directory for copied files (default: ~/Downloads/cited-docs)')
    parser.add_argument('--clean',
                        action='store_true',
                        help='Clean the output directory before copying files')

    args = parser.parse_args()

    if args.extract:
        extract_citations(args)
    elif args.copy:
        copy_cited_files(args)
    else:
        parser.print_help()
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()