add _extensions
This commit is contained in:
@@ -0,0 +1,254 @@
|
||||
"""
|
||||
Citation Tools for Academic Writing
|
||||
|
||||
This script provides utilities for managing citations in academic writing:
|
||||
1. Extract citation keys from Markdown files and create a filtered bibliography
|
||||
2. Copy cited reference files to a specified directory for backup or sharing
|
||||
|
||||
Typical usage:
|
||||
python citation-tools.py --extract
|
||||
python citation-tools.py --copy
|
||||
|
||||
Copyright: © 2025–Present Tom Ben
|
||||
License: MIT License
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def extract_citation_keys(markdown_file):
    """Extract citation keys from a Markdown file.

    Recognizes Pandoc-style citations in both the bracketed form
    (``[@key]``, ``[@key1; @key2]``) and the bare form (``@key``).  An ``@``
    directly preceded by a letter or digit (for example inside an e-mail
    address) is not treated as a citation.

    Keys may consist of ASCII letters, digits and hyphens, with underscores
    allowed between such runs (``smith_2020``).  A key is only accepted when
    it is followed by whitespace, one of ``. , ; : ) ] }`` or the end of the
    text.

    Args:
        markdown_file: Path of the Markdown file to scan; read as UTF-8.

    Returns:
        set[str]: The unique keys found, excluding those that start with
        ``fig-`` or ``tbl-`` (figure and table cross-references).
    """
    with open(markdown_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # One pattern is enough for both citation forms: in "[@key]" the "[" is
    # simply a non-alphanumeric character in front of "@", and every
    # terminator valid inside brackets is also valid for a bare citation.
    #
    # Underscores are accepted only *between* runs of key characters, so
    # "smith_2020" is captured while Markdown emphasis such as "_@key_" does
    # not yield a key with a stray trailing underscore.
    pattern = (
        r'(?<![a-zA-Z0-9])'
        r'@([a-zA-Z0-9\-]+(?:_+[a-zA-Z0-9\-]+)*)'
        r'(?:[\s\.\,\;\:\)\]\}]|$)'
    )
    keys = set(re.findall(pattern, content))

    # Drop figure and table cross-references, which share the "@" syntax.
    return {key for key in keys if not key.startswith(('fig-', 'tbl-'))}
|
||||
|
||||
|
||||
def load_csl_entries(csl_json_file):
    """Read a CSL JSON bibliography file and return its entries.

    Args:
        csl_json_file: Path of the JSON file; read as UTF-8.

    Returns:
        list: The parsed top-level JSON array.

    Raises:
        ValueError: If the top-level JSON value is not a list.
    """
    with open(csl_json_file, 'r', encoding='utf-8') as handle:
        entries = json.load(handle)

    if isinstance(entries, list):
        return entries

    raise ValueError(
        f"Expected a list of CSL JSON entries, got {type(entries)} instead.")
|
||||
|
||||
|
||||
def parse_file_field(file_field):
    """Split a CSL JSON ``file`` field into its individual paths.

    The field is treated as a ``;``-separated string.  Surrounding whitespace
    is stripped from each part and empty parts are discarded.

    Args:
        file_field: The raw field value.

    Returns:
        list[str]: The paths in their original order; an empty list when the
        value is empty or is not a string.
    """
    if not (file_field and isinstance(file_field, str)):
        return []

    trimmed = (part.strip() for part in file_field.split(';'))
    return [part for part in trimmed if part]
|
||||
|
||||
|
||||
def build_citation_file_index(entries):
    """Map each citation ID to the file paths attached to its entry.

    Entries without an ``id`` and entries whose ``file`` field yields no
    paths are left out of the result.

    Args:
        entries: Iterable of CSL JSON entries (mappings supporting ``.get``).

    Returns:
        dict: ``{citation_id: [path, ...]}``.  When several entries share an
        ID, the last one that has file paths wins.
    """
    files_by_id = {}

    for record in entries:
        citation_id = record.get('id')
        if citation_id:
            attached = parse_file_field(record.get('file'))
            if attached:
                files_by_id[citation_id] = attached

    return files_by_id
|
||||
|
||||
|
||||
def extract_csl_json_entries(csl_json_file, citation_keys, remove_fields=None):
    """Serialize the bibliography entries that match the given citation keys.

    Args:
        csl_json_file: Path of the CSL JSON bibliography to read.
        citation_keys: Iterable of citation IDs to keep.
        remove_fields: Field names to drop from every kept entry.  Defaults
            to ``['file']`` when ``None``.

    Returns:
        str: A JSON array of the kept entries, sorted by ``id``, indented by
        two spaces, with non-ASCII characters preserved and a trailing
        newline.
    """
    excluded = ['file'] if remove_fields is None else remove_fields

    entries = load_csl_entries(csl_json_file)
    wanted = set(citation_keys)

    selected = []
    for entry in entries:
        entry_id = entry.get('id')
        if not entry_id or entry_id not in wanted:
            continue
        selected.append({
            field: value
            for field, value in entry.items()
            if field not in excluded
        })

    # Sort after filtering so the output is stable regardless of the order
    # of entries in the source bibliography.
    ordered = sorted(selected, key=lambda item: item.get('id', ''))
    return json.dumps(ordered, ensure_ascii=False, indent=2) + '\n'
|
||||
|
||||
|
||||
def copy_cited_files(args):
    """Copy the files attached to cited references into ``args.output_dir``.

    Citation keys are collected from every Markdown file in
    ``args.content_dir`` whose name starts with a digit (``[0-9]*.md``).  For
    each key with file paths in the bibliography, the first path that exists
    on disk is copied to ``<output_dir>/<key><original extension>``.  A
    summary is printed at the end.

    Args:
        args: Parsed command-line namespace.  Reads ``bib``, ``content_dir``,
            ``output_dir`` and ``clean``.

    Returns:
        set[str]: All citation keys found in the Markdown files.

    Raises:
        OSError: If the bibliography or a Markdown file cannot be read, or
            the output directory cannot be cleaned or created.
        ValueError: If the bibliography is not a JSON list (raised by
            ``load_csl_entries``).
    """
    # Read every input *before* touching the output directory.  Otherwise a
    # missing or malformed bibliography combined with --clean would wipe the
    # previous copies and then abort, leaving nothing behind.
    entries = load_csl_entries(args.bib)
    citation_files = build_citation_file_index(entries)

    # Find all Markdown files in content directory
    markdown_files = list(Path(args.content_dir).glob('[0-9]*.md'))

    # Extract all citation keys from Markdown files
    all_keys = set()
    for md_file in markdown_files:
        all_keys.update(extract_citation_keys(md_file))

    # Clean output directory if requested
    if args.clean and os.path.exists(args.output_dir):
        print(f"Cleaning output directory: {args.output_dir}")
        shutil.rmtree(args.output_dir)

    # Create output directory if it doesn't exist
    os.makedirs(args.output_dir, exist_ok=True)

    copied_count = 0
    missing_keys = []      # cited keys with no file paths in the bibliography
    not_found_pairs = []   # (key, path) where no listed path exists on disk
    failed_keys = []       # keys whose file exists but could not be copied

    # Sorted iteration keeps the order of any error messages deterministic.
    for key in sorted(all_keys):
        paths = citation_files.get(key)
        if not paths:
            missing_keys.append(key)
            continue

        existing_path = next(
            (path for path in paths if os.path.exists(path)), None)
        if existing_path is None:
            # Report the first listed path so the user knows where to look.
            not_found_pairs.append((key, paths[0]))
            continue

        _, file_extension = os.path.splitext(existing_path)
        dest_path = os.path.join(args.output_dir, f"{key}{file_extension}")

        try:
            shutil.copy2(existing_path, dest_path)
        except OSError as e:
            # One unreadable file should not abort the whole run, but the
            # failure is recorded so it shows up in the summary.
            print(f"Error copying {key}: {e}")
            failed_keys.append(key)
        else:
            copied_count += 1

    # Print simplified summary
    print(f"Markdown files in content directory: {len(markdown_files)}")
    print(f"Total unique citation keys found: {len(all_keys)}")
    print(f"Files successfully copied: {copied_count}")
    print(f"Citation keys without file paths: {len(missing_keys)}")
    print(
        f"Files not found (path exists in bibliography but file missing): {len(not_found_pairs)}")
    if failed_keys:
        print(f"Files that could not be copied: {len(failed_keys)}")

    if missing_keys:
        print("\nCitation keys without file paths:")
        for key in sorted(missing_keys):
            print(f"  - {key}")

    if not_found_pairs:
        print("\nCitation keys where file wasn't found:")
        for key, path in sorted(not_found_pairs):
            print(f"  - {key}: {path}")

    return all_keys
|
||||
|
||||
|
||||
def extract_citations(args):
    """Write a filtered CSL JSON file containing only the cited entries.

    Citation keys are gathered from every Markdown file in
    ``args.content_dir`` whose name starts with a digit (``[0-9]*.md``); the
    matching entries of ``args.bib`` are then written to ``args.output_bib``
    with ``args.remove_fields`` stripped from each entry.

    Args:
        args: Parsed command-line namespace.  Reads ``content_dir``, ``bib``,
            ``remove_fields`` and ``output_bib``.

    Returns:
        set[str]: All citation keys found in the Markdown files.
    """
    markdown_files = list(Path(args.content_dir).glob('[0-9]*.md'))

    # Union of the keys cited in each file.
    all_keys = set().union(
        *(extract_citation_keys(md_file) for md_file in markdown_files))

    json_content = extract_csl_json_entries(
        args.bib, all_keys, args.remove_fields)

    with open(args.output_bib, 'w', encoding='utf-8') as out:
        out.write(json_content)

    summary = (
        f"Markdown files in content directory: {len(markdown_files)}",
        f"Total unique citation keys found: {len(all_keys)}",
        f"Extracted citations to `{args.output_bib}`",
    )
    for line in summary:
        print(line)

    return all_keys
|
||||
|
||||
|
||||
def main(argv=None):
    """Parse command line arguments and run the requested action(s).

    ``--extract`` writes a filtered CSL JSON bibliography and ``--copy``
    copies the cited files.  When both flags are given, both actions run
    (extract first); when neither is given, the help text is printed.

    Args:
        argv: Optional list of argument strings.  ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.
    """
    # Get script location and project root.  When the script lives in an
    # "_extensions" directory, the project root is that directory's parent.
    script_dir = Path(__file__).parent.resolve()
    project_root = script_dir.parent if script_dir.name == "_extensions" else script_dir

    parser = argparse.ArgumentParser(
        description='Citation tools for extracting and copying cited references')

    # Defaults shared by both actions
    default_bib = os.path.expanduser(
        "~/Library/CloudStorage/Dropbox/pkm/bibliography.json")
    default_content_dir = str(project_root / "contents")

    # Add command flags instead of subcommands
    parser.add_argument('--extract', action='store_true',
                        help='Extract citations to a filtered CSL JSON file')
    parser.add_argument('--copy', action='store_true',
                        help='Copy cited files to a directory')

    # Common arguments for both commands
    parser.add_argument('--bib',
                        default=default_bib,
                        help=f'Path to bibliography.json file (default: {default_bib})')
    parser.add_argument('--content_dir',
                        default=default_content_dir,
                        help=f'Path to content directory with Markdown files (default: {default_content_dir})')

    # Arguments specific to extract
    parser.add_argument('--output_bib',
                        default=str(project_root / "citebib.json"),
                        help=f'Path to output CSL JSON file (default: {project_root}/citebib.json)')
    parser.add_argument('--remove_fields',
                        nargs='+',
                        default=['file'],
                        help='Fields to remove from CSL JSON entries (default: file)')

    # Arguments specific to copy
    parser.add_argument('--output_dir',
                        default=os.path.expanduser(
                            "~/Downloads/cited-docs"),
                        help='Path to output directory for copied files (default: ~/Downloads/cited-docs)')
    parser.add_argument('--clean',
                        action='store_true',
                        help='Clean the output directory before copying files')

    args = parser.parse_args(argv)

    if not (args.extract or args.copy):
        parser.print_help()
        return

    # The two flags are independent: honour each one that was requested
    # instead of silently ignoring --copy when --extract is also present.
    if args.extract:
        extract_citations(args)
    if args.copy:
        copy_cited_files(args)
|
||||
|
||||
|
||||
# Run the command-line interface only when this file is executed as a script,
# so importing the module has no side effects.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user