add _extensions

This commit is contained in:
2026-05-21 13:37:53 +08:00
parent 6a9a5fc90e
commit 61bd0bea2f
252 changed files with 33972 additions and 1 deletions
@@ -0,0 +1,254 @@
"""
Citation Tools for Academic Writing
This script provides utilities for managing citations in academic writing:
1. Extract citation keys from Markdown files and create a filtered bibliography
2. Copy cited reference files to a specified directory for backup or sharing
Typical usage:
python citation-tools.py --extract
python citation-tools.py --copy
Copyright: © 2025–Present Tom Ben
License: MIT License
"""
import os
import re
import shutil
import argparse
import json
from pathlib import Path
def extract_citation_keys(markdown_file):
    """Extract bibliography citation keys from a Markdown file.

    Recognises Pandoc-style citations such as ``[@key]``,
    ``[@key1; @key2]``, ``[-@key]`` and in-text ``@key``. A key is a run of
    ASCII letters, digits, hyphens and underscores that is followed by
    whitespace, one of ``. , ; : ) ] }``, or the end of the text. An ``@``
    directly preceded by a letter or digit (for example inside an e-mail
    address) is not treated as a citation.

    Args:
        markdown_file: Path of the UTF-8 encoded Markdown file to scan.

    Returns:
        A set of citation key strings, excluding cross-references whose
        key starts with ``fig-`` or ``tbl-``.
    """
    with open(markdown_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # A single pattern handles bracketed and bare citations alike: inside
    # ``[@key]`` the ``@`` is preceded by ``[`` (not alphanumeric) and the
    # key is closed by ``]``, ``;`` or ``,``, all of which are accepted
    # terminators. Underscores are part of the key so that keys such as
    # ``smith_2020`` are not silently skipped.
    pattern = r'(?<![a-zA-Z0-9])@([a-zA-Z0-9_\-]+)(?:[\s\.\,\;\:\)\]\}]|$)'
    all_keys = set(re.findall(pattern, content))

    # Figure and table cross-references share the ``@`` syntax but are not
    # bibliography entries.
    return {key for key in all_keys if not key.startswith(('fig-', 'tbl-'))}
def load_csl_entries(csl_json_file):
    """Read a CSL JSON bibliography and return its top-level list.

    Args:
        csl_json_file: Path of the UTF-8 encoded JSON file to parse.

    Returns:
        The parsed JSON array as a Python list.

    Raises:
        ValueError: If the parsed document is not a JSON array.
    """
    with open(csl_json_file, 'r', encoding='utf-8') as handle:
        entries = json.load(handle)

    if isinstance(entries, list):
        return entries
    raise ValueError(
        f"Expected a list of CSL JSON entries, got {type(entries)} instead.")
def parse_file_field(file_field):
    """Split a semicolon-separated CSL ``file`` value into clean paths.

    Anything that is not a non-empty string yields an empty list. Blank
    segments are dropped and surrounding whitespace is trimmed from each
    remaining path.
    """
    if not isinstance(file_field, str) or not file_field:
        return []

    paths = []
    for segment in file_field.split(';'):
        trimmed = segment.strip()
        if trimmed:
            paths.append(trimmed)
    return paths
def build_citation_file_index(entries):
    """Map each citation ID to the file paths attached to its entry.

    Entries with a missing or falsy ``id``, and entries whose ``file``
    field yields no paths, are left out of the result. When several
    entries share an ID, the last one wins.
    """
    id_and_paths = (
        (item.get('id'), parse_file_field(item.get('file')))
        for item in entries
    )
    return {
        cite_id: attached
        for cite_id, attached in id_and_paths
        if cite_id and attached
    }
def extract_csl_json_entries(csl_json_file, citation_keys, remove_fields=None):
    """Serialise the bibliography entries whose ID is in ``citation_keys``.

    Args:
        csl_json_file: Path of the CSL JSON bibliography to read.
        citation_keys: Iterable of citation IDs to keep.
        remove_fields: Field names stripped from every kept entry;
            ``None`` means ``['file']``.

    Returns:
        A JSON array string (2-space indent, non-ASCII preserved, trailing
        newline) with the kept entries sorted by ``id``.
    """
    dropped = ['file'] if remove_fields is None else remove_fields
    records = load_csl_entries(csl_json_file)
    wanted = set(citation_keys)

    selected = []
    for record in records:
        record_id = record.get('id')
        if not record_id or record_id not in wanted:
            continue
        selected.append(
            {name: value for name, value in record.items()
             if name not in dropped})

    ordered = sorted(selected, key=lambda item: item.get('id', ''))
    return json.dumps(ordered, ensure_ascii=False, indent=2) + '\n'
def copy_cited_files(args):
    """Copy the files attached to cited references into ``args.output_dir``.

    Citation keys are collected from the Markdown files directly inside
    ``args.content_dir`` whose names start with a digit and end in ``.md``.
    For every key that has attached paths in the bibliography ``args.bib``,
    the first path that is an existing regular file is copied to
    ``<output_dir>/<key><extension>``. A summary of copied, missing and
    not-found items is printed at the end.

    Args:
        args: Parsed command-line namespace providing ``bib``,
            ``content_dir``, ``output_dir`` and ``clean``.

    Returns:
        The set of all citation keys found in the Markdown files.
    """
    # Clean output directory if requested
    if args.clean and os.path.exists(args.output_dir):
        print(f"Cleaning output directory: {args.output_dir}")
        shutil.rmtree(args.output_dir)

    # Create output directory if it doesn't exist
    os.makedirs(args.output_dir, exist_ok=True)

    # Parse bibliography (silently)
    entries = load_csl_entries(args.bib)
    citation_files = build_citation_file_index(entries)

    # Find all Markdown files in content directory
    markdown_files = list(Path(args.content_dir).glob('[0-9]*.md'))

    # Extract all citation keys from Markdown files
    all_keys = set()
    for md_file in markdown_files:
        all_keys.update(extract_citation_keys(md_file))

    copied_count = 0
    missing_keys = []
    not_found_pairs = []

    # Sorted iteration keeps any per-key error output deterministic.
    for key in sorted(all_keys):
        paths = citation_files.get(key)
        if not paths:
            missing_keys.append(key)
            continue

        # Only a regular file can be copied; a directory listed in the
        # bibliography must not shadow a later path that is a real file.
        source_path = next(
            (path for path in paths if os.path.isfile(path)), None)
        if source_path is None:
            # Report the first listed path so the user knows where to look.
            not_found_pairs.append((key, paths[0]))
            continue

        _, file_extension = os.path.splitext(source_path)
        dest_path = os.path.join(args.output_dir, f"{key}{file_extension}")
        try:
            shutil.copy2(source_path, dest_path)
        except OSError as e:
            # Best effort: one unreadable file should not abort the batch.
            print(f"Error copying {key}: {e}")
        else:
            copied_count += 1

    # Print simplified summary
    print(f"Markdown files in content directory: {len(markdown_files)}")
    print(f"Total unique citation keys found: {len(all_keys)}")
    print(f"Files successfully copied: {copied_count}")
    print(f"Citation keys without file paths: {len(missing_keys)}")
    print(
        f"Files not found (path exists in bibliography but file missing): {len(not_found_pairs)}")

    if missing_keys:
        print("\nCitation keys without file paths:")
        for key in sorted(missing_keys):
            print(f" - {key}")

    if not_found_pairs:
        print("\nCitation keys where file wasn't found:")
        for key, path in sorted(not_found_pairs):
            print(f" - {key}: {path}")

    return all_keys
def extract_citations(args):
    """Write a CSL JSON bibliography containing only the cited entries.

    Scans the Markdown files directly inside ``args.content_dir`` whose
    names start with a digit, gathers their citation keys, filters the
    bibliography ``args.bib`` down to those keys (dropping
    ``args.remove_fields`` from each entry) and writes the result to
    ``args.output_bib``.

    Returns:
        The set of all citation keys found in the Markdown files.
    """
    content_root = Path(args.content_dir)
    markdown_files = list(content_root.glob('[0-9]*.md'))

    # Merge the per-file key sets into one.
    all_keys = set().union(
        *(extract_citation_keys(md_path) for md_path in markdown_files))

    json_content = extract_csl_json_entries(
        args.bib, all_keys, args.remove_fields)

    with open(args.output_bib, 'w', encoding='utf-8') as out:
        out.write(json_content)

    summary = (
        f"Markdown files in content directory: {len(markdown_files)}",
        f"Total unique citation keys found: {len(all_keys)}",
        f"Extracted citations to `{args.output_bib}`",
    )
    for line in summary:
        print(line)

    return all_keys
def main():
    """Parse command-line arguments and run the requested action(s).

    ``--extract`` writes a filtered CSL JSON bibliography and ``--copy``
    copies the files attached to cited references. Both flags may be given
    together, in which case extraction runs first and copying second. With
    neither flag the help text is printed.
    """
    # Get script location and project root
    script_dir = Path(__file__).parent.resolve()
    project_root = script_dir.parent if script_dir.name == "_extensions" else script_dir

    parser = argparse.ArgumentParser(
        description='Citation tools for extracting and copying cited references')

    # Defaults are computed once so the help text always matches the value
    # that is actually used.
    default_bib = os.path.expanduser(
        "~/Library/CloudStorage/Dropbox/pkm/bibliography.json")
    default_content_dir = str(project_root / "contents")
    default_output_bib = str(project_root / "citebib.json")

    # Add command flags instead of subcommands
    parser.add_argument('--extract', action='store_true',
                        help='Extract citations to a filtered CSL JSON file')
    parser.add_argument('--copy', action='store_true',
                        help='Copy cited files to a directory')

    # Common arguments for both commands
    parser.add_argument('--bib',
                        default=default_bib,
                        help=f'Path to bibliography.json file (default: {default_bib})')
    parser.add_argument('--content_dir',
                        default=default_content_dir,
                        help=f'Path to content directory with Markdown files (default: {default_content_dir})')

    # Arguments specific to extract
    parser.add_argument('--output_bib',
                        default=default_output_bib,
                        help=f'Path to output CSL JSON file (default: {default_output_bib})')
    parser.add_argument('--remove_fields',
                        nargs='+',
                        default=['file'],
                        help='Fields to remove from CSL JSON entries (default: file)')

    # Arguments specific to copy
    parser.add_argument('--output_dir',
                        default=os.path.expanduser(
                            "~/Downloads/cited-docs"),
                        help='Path to output directory for copied files (default: ~/Downloads/cited-docs)')
    parser.add_argument('--clean',
                        action='store_true',
                        help='Clean the output directory before copying files')

    args = parser.parse_args()

    if not (args.extract or args.copy):
        parser.print_help()
        return

    # The flags are independent, so honour each one that was given rather
    # than silently ignoring --copy when --extract is also present.
    if args.extract:
        extract_citations(args)
    if args.copy:
        copy_cited_files(args)
# Run the command-line interface only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()