Add workflows to detect orphaned files and images (#53)

Introduces two GitHub Actions workflows: one to find orphaned markdown files and another to detect unreferenced images in the repository. These checks run on pull requests affecting markdown files and help maintain documentation and asset hygiene by surfacing unused files.
This commit is contained in:
Ian Bassi
2025-12-04 12:53:10 -03:00
committed by GitHub
parent 8660046ccf
commit fb748ebee4
2 changed files with 455 additions and 0 deletions

229
.github/workflows/orphaned_files.yml vendored Normal file
View File

@@ -0,0 +1,229 @@
# Workflow: scans the repository for Markdown files that no other Markdown file links to
# and fails the job when any are found (see the "Show orphaned files" step).
name: Find Orphaned Markdown Files
on:
pull_request:
# Only pull requests touching a file with one of these Markdown extensions trigger the check.
paths:
- '**/*.md'
- '**/*.markdown'
- '**/*.mdown'
- '**/*.mkd'
- '**/*.mkdn'
- '**/*.mdx'
# Also allow manual runs.
workflow_dispatch: {}
jobs:
orphaned-check:
runs-on: ubuntu-latest
permissions:
# Read-only access to repository contents.
contents: read
env:
# Starts empty; the script step exports the list of orphans into it, which gates the final step.
ERROR_BLOCK: ''
steps:
- name: Checkout repository
uses: actions/checkout@v6
with:
# 0 requests the full history.
# NOTE(review): the script only reads the working tree — confirm full history is needed.
fetch-depth: 0
# Runs the inline JavaScript below; it exports RANKING_BLOCK and ERROR_BLOCK for the later steps.
- name: Find orphaned markdown docs
id: find_orphaned
uses: actions/github-script@v8
with:
script: |
// Node built-ins used to walk the checkout and to manipulate paths.
const fs = require('fs');
const path = require('path');
// The scan is rooted at the current working directory
// (presumably the checked-out repository — verify against the runner setup).
const workspace = process.cwd();
const workspaceRoot = path.resolve(workspace);
// Lower-case file extensions (with leading dot) that count as Markdown documents.
const allowedExt = new Set(['.md', '.markdown', '.mdown', '.mkd', '.mkdn', '.mdx']);
/**
 * Recursively lists the Markdown files below a directory of the workspace.
 *
 * Entries named `.git` and directories whose name starts with a dot are skipped.
 * Entries that are neither directories nor regular files are ignored.
 *
 * @param {string} relativeDir - Directory relative to `workspaceRoot`; '' means the root itself.
 * @returns {string[]} Workspace-relative paths using '/' separators, in directory-listing order.
 *   An unreadable directory yields an empty list instead of throwing.
 */
function collectMarkdownFiles(relativeDir) {
  const dirOnDisk = relativeDir ? path.join(workspaceRoot, relativeDir) : workspaceRoot;
  let dirents;
  try {
    dirents = fs.readdirSync(dirOnDisk, { withFileTypes: true });
  } catch (_) {
    // Best effort: a missing or unreadable directory simply contributes nothing.
    return [];
  }
  const found = [];
  dirents.forEach((dirent) => {
    const { name } = dirent;
    const isDir = dirent.isDirectory();
    if (name === '.git' || (isDir && name.startsWith('.'))) return;
    const childPath = relativeDir ? `${relativeDir}/${name}` : name;
    if (isDir) {
      found.push(...collectMarkdownFiles(childPath));
      return;
    }
    if (!dirent.isFile()) return;
    if (allowedExt.has(path.extname(name).toLowerCase())) {
      found.push(childPath.replace(/\\/g, '/'));
    }
  });
  return found;
}
/**
 * Converts a character offset into a 1-based line number.
 *
 * @param {string} text - Text the offset refers to.
 * @param {number} index - Zero-based character offset into `text`.
 * @returns {number} 1 plus the number of '\n' characters located strictly before `index`.
 */
function lineFromIndex(text, index) {
  let lineNumber = 1;
  let newlineAt = text.indexOf('\n');
  // Hop from newline to newline instead of inspecting every character.
  while (newlineAt !== -1 && newlineAt < index) {
    lineNumber += 1;
    newlineAt = text.indexOf('\n', newlineAt + 1);
  }
  return lineNumber;
}
// Build index for 'basename' => list of paths
const markdownFiles = collectMarkdownFiles('');
if (!markdownFiles.length) {
core.info('No Markdown files found; skipping orphan check.');
return;
}
const nameIndex = new Map();
for (const p of markdownFiles) {
const baseName = path.basename(p, path.extname(p));
const lower = baseName.toLowerCase();
if (!nameIndex.has(lower)) nameIndex.set(lower, []);
nameIndex.get(lower).push(p);
}
// Collect every inline Markdown link ([text](url)) from all Markdown files.
// Images (![alt](url)) are excluded by the negative lookbehind; fenced code blocks are blanked out first.
const codeBlockPattern = /^```+([\s\S]*?)^```+$/gm;
const markdownLinkPattern = /(?<!\!)\[(?:[^\[\]]|\[[^\[\]]*\])*\]\(\s*(<[^>]+>|[^)\s]+)(?:\s+"[^"]*")?\s*\)/g;
// Each entry: { source: file containing the link, url: raw link target, line: 1-based line in that file }.
const links = [];
for (const filePath of markdownFiles) {
  const rawText = fs.readFileSync(path.join(workspaceRoot, filePath), 'utf8');
  // Blank fenced code blocks but keep their newlines, so the recorded line numbers
  // still refer to lines of the file on disk rather than of the stripped text.
  const text = rawText.replace(codeBlockPattern, (block) => block.replace(/[^\n]/g, ''));
  for (const match of text.matchAll(markdownLinkPattern)) {
    // The pattern has exactly one capture group: the link destination.
    const url = match[1];
    links.push({ source: filePath, url: url.trim(), line: lineFromIndex(text, match.index) });
  }
}
// Link helpers
/**
 * Tells whether a link target points outside the repository.
 *
 * @param {string} target - Normalized link target.
 * @returns {boolean} true for an empty target, a target starting with a URI scheme
 *   (`http:`, `mailto:`, ...) or a protocol-relative target (`//host/...`); false otherwise.
 */
function isExternal(target) {
  const schemePrefix = /^[a-zA-Z][a-zA-Z0-9+.-]*:/;
  return !target || schemePrefix.test(target) || target.startsWith('//');
}
/**
 * Cleans a raw link destination before it is resolved.
 *
 * Trims whitespace, removes a surrounding `<...>` pair, drops everything from the first '?'
 * (query string), and percent-decodes the remainder.
 *
 * @param {string} raw - Link destination as written in the Markdown source.
 * @returns {string} '' for an empty/missing target, '#' for an anchor-only target, otherwise
 *   the decoded target (left undecoded when it is not valid percent-encoding).
 */
function normalizeTarget(raw) {
  const trimmed = raw ? raw.trim() : '';
  if (trimmed === '') return '';
  const unwrapped = trimmed.startsWith('<') && trimmed.endsWith('>')
    ? trimmed.slice(1, -1).trim()
    : trimmed;
  // Keep only what precedes the first '?'; the underlying path is all the caller needs.
  const [withoutQuery] = unwrapped.split('?');
  if (withoutQuery.startsWith('#')) return '#';
  let decoded = withoutQuery;
  try {
    decoded = decodeURIComponent(withoutQuery);
  } catch (_) {
    // Malformed escape sequence: fall back to the undecoded text.
  }
  return decoded;
}
/**
 * Resolves a link target to one of the indexed Markdown files.
 *
 * Resolution is by file name only (any directory part of the link is ignored) and is
 * case-insensitive. The full file name is tried first, so a link to a page whose name
 * contains a dot (`Version-1.2` -> `Version-1.2.md`) is found; only when that fails is
 * the last dot-suffix treated as an extension and stripped (`Page.md`, `Page.html` -> `Page`).
 *
 * @param {string} sourceFile - Workspace-relative path of the file containing the link.
 * @param {string} rawPath - Link target without its '#fragment'.
 * @returns {string|null} Workspace-relative path of the matched file, or null when no file has that name.
 *   When several files share the name, the one in the same folder as `sourceFile` wins; otherwise
 *   the first indexed match is returned so the reference is still counted.
 */
function tryResolve(sourceFile, rawPath) {
  let sanitized = rawPath.replace(/\\/g, '/');
  if (sanitized.startsWith('/')) sanitized = sanitized.slice(1);
  const fileName = path.basename(sanitized);
  const fullKey = fileName.toLowerCase();
  const stemKey = path.basename(fileName, path.extname(fileName)).toLowerCase();
  // Prefer the exact name; fall back to the name with its last dot-suffix removed.
  const matches = nameIndex.get(fullKey) || nameIndex.get(stemKey) || [];
  if (matches.length === 0) return null;
  if (matches.length === 1) return matches[0];
  // Several candidates: prefer the one that lives next to the linking file.
  const folder = path.dirname(sourceFile);
  const sibling = matches.find((candidate) => path.dirname(candidate) === folder);
  // Still ambiguous: return the first match to err on the side of counting the reference.
  return sibling !== undefined ? sibling : matches[0];
}
// Reference counts per Markdown file: links coming from Home.md vs. links from any other file.
const counts = new Map();
for (const f of markdownFiles) counts.set(f, { home: 0, others: 0 });
for (const link of links) {
  const normalized = normalizeTarget(link.url);
  // Empty targets, anchor-only targets and external URLs never reference a repository file.
  if (!normalized || normalized === '#') continue;
  if (isExternal(normalized)) continue;
  // Drop the '#fragment' (split at the first '#' only).
  const hash = normalized.indexOf('#');
  const docPart = hash === -1 ? normalized : normalized.slice(0, hash);
  const resolved = tryResolve(link.source, docPart);
  // Unresolvable links and self-references do not count.
  if (!resolved || resolved === link.source) continue;
  const entry = counts.get(resolved);
  if (!entry) continue;
  // Compare the file name itself: a substring test on the whole path would also treat
  // files such as "my-home.md" or "home.mdx" as the home page.
  const isHome = path.basename(link.source).toLowerCase() === 'home.md';
  if (isHome) {
    entry.home += 1;
  } else {
    entry.others += 1;
  }
}
// Ranking: every file except Home.md/README.md, most-referenced first, ties broken by path.
// Home.md and README.md stay out because they are entry points, not link targets.
const excludedFromRanking = ['home.md', 'readme.md'];
const rankingArray = [...counts]
  .filter(([f]) => !excludedFromRanking.includes(path.basename(f).toLowerCase()))
  .map(([f, obj]) => ({ file: f, home: obj.home, others: obj.others, total: obj.home + obj.others }));
rankingArray.sort((a, b) => (b.total - a.total) || a.file.localeCompare(b.file));
// One line per file: "<path>, <refs from Home.md>, <refs from other files> (<total>)".
const rankingLines = rankingArray.map(
  ({ file, home, others, total }) => `${file}, ${home}, ${others} (${total})`,
);
core.exportVariable('RANKING_BLOCK', rankingLines.join('\n'));
// Orphans: files (other than Home.md/README.md) that nothing links to.
const excludedFromOrphans = ['home.md', 'readme.md'];
const orphanLines = [];
counts.forEach((obj, f) => {
  if (excludedFromOrphans.includes(path.basename(f).toLowerCase())) return;
  if (obj.home !== 0 || obj.others !== 0) return;
  orphanLines.push(`${f}, ${obj.home}, ${obj.others}`);
});
// ERROR_BLOCK is always exported: non-empty makes the later workflow step fail the job.
if (orphanLines.length > 0) {
  core.exportVariable('ERROR_BLOCK', orphanLines.join('\n'));
  core.info(`Found ${orphanLines.length} orphaned markdown file(s).`);
} else {
  core.exportVariable('ERROR_BLOCK', '');
  core.info('No orphaned markdown files found.');
}
# Prints the ranking exported by the script step (RANKING_BLOCK) inside a fenced block.
- name: Show reference ranking
run: |
echo 'Markdown files ranking (from most to least referenced):'
printf '```\n%s\n```\n' "$RANKING_BLOCK"
# Runs only when the script exported a non-empty ERROR_BLOCK: prints the orphans and fails the job.
- name: Show orphaned files
if: env.ERROR_BLOCK != ''
run: |
echo 'Orphaned markdown files (Name, [refs in Home.md], [refs in other files]):'
printf '```\n%s\n```\n' "$ERROR_BLOCK"
exit 1

View File

@@ -0,0 +1,226 @@
# Workflow: lists image files under images/ that no Markdown file references
# and fails the job when any are found (see the "Show unreferenced images" step).
name: Find Unreferenced Images
on:
pull_request:
# Only pull requests touching a Markdown file trigger the check.
# NOTE(review): a PR that only adds or changes files under images/ does not match these
# paths, so it will not run this check — confirm that is intended.
paths:
- '**/*.md'
- '**/*.markdown'
- '**/*.mdown'
- '**/*.mkd'
- '**/*.mkdn'
- '**/*.mdx'
# Also allow manual runs.
workflow_dispatch: {}
jobs:
unreferenced-images:
runs-on: ubuntu-latest
permissions:
# Read-only access to repository contents.
contents: read
env:
# Both start empty; the script step exports the real values for the reporting steps.
ERROR_BLOCK: ''
RANKING_BLOCK: ''
steps:
- name: Checkout repository
uses: actions/checkout@v6
with:
# 0 requests the full history.
# NOTE(review): the script only reads the working tree — confirm full history is needed.
fetch-depth: 0
# Runs the inline JavaScript below; it exports RANKING_BLOCK and ERROR_BLOCK for the later steps.
- name: Find unreferenced images in /images
id: find_images
uses: actions/github-script@v8
env:
# NOTE(review): the script below never reads GITHUB_TOKEN from the environment — confirm this entry is needed.
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
script: |
// Node built-ins used to walk the checkout and to manipulate paths.
const fs = require('fs');
const path = require('path');
// The scan is rooted at the current working directory
// (presumably the checked-out repository — verify against the runner setup).
const workspace = process.cwd();
const workspaceRoot = path.resolve(workspace);
// Owner and repository name of the workflow run; used to recognise GitHub URLs that point at this repository.
const currentRepo = context.repo.repo;
const currentOwner = context.repo.owner;
// Lower-case file extensions (with leading dot) that count as images / as Markdown documents.
const allowedImageExt = new Set(['.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.bmp', '.ico', '.avif']);
const allowedMarkdownExt = new Set(['.md', '.markdown', '.mdown', '.mkd', '.mkdn', '.mdx']);
/**
 * Recursively lists the files below a workspace directory whose extension is in `extSet`.
 *
 * Only entries named `.git` are skipped; other dot-directories are traversed.
 * Entries that are neither directories nor regular files are ignored.
 *
 * @param {string} relativeDir - Directory relative to `workspaceRoot`; '' means the root itself.
 * @param {Set<string>} extSet - Accepted lower-case extensions, including the leading dot.
 * @returns {string[]} Workspace-relative paths using '/' separators, in directory-listing order.
 *   An unreadable directory yields an empty list instead of throwing.
 */
function collectFilesUnder(relativeDir, extSet) {
  const dirOnDisk = relativeDir ? path.join(workspaceRoot, relativeDir) : workspaceRoot;
  let dirents;
  try {
    dirents = fs.readdirSync(dirOnDisk, { withFileTypes: true });
  } catch (_) {
    // Best effort: a missing or unreadable directory simply contributes nothing.
    return [];
  }
  return dirents
    .filter((dirent) => dirent.name !== '.git')
    .flatMap((dirent) => {
      const childPath = relativeDir ? `${relativeDir}/${dirent.name}` : dirent.name;
      if (dirent.isDirectory()) return collectFilesUnder(childPath, extSet);
      const accepted = dirent.isFile() && extSet.has(path.extname(dirent.name).toLowerCase());
      return accepted ? [childPath.replace(/\\/g, '/')] : [];
    });
}
/**
 * Converts a character offset into a 1-based line number.
 *
 * @param {string} text - Text the offset refers to.
 * @param {number} index - Zero-based character offset into `text`.
 * @returns {number} 1 plus the number of '\n' characters located strictly before `index`.
 */
function lineFromIndex(text, index) {
  let lineNumber = 1;
  for (const newline of text.matchAll(/\n/g)) {
    // Stop at the first newline that is not strictly before the offset.
    if (!(newline.index < index)) break;
    lineNumber += 1;
  }
  return lineNumber;
}
// Gather images under images/ folder
const candidateImages = collectFilesUnder('images', allowedImageExt).map(p => p.replace(/\\/g, '/'));
if (!candidateImages.length) {
core.info('No images found under images/; skipping unreferenced image check.');
return;
}
// Build a map of image -> count
const counts = new Map();
for (const img of candidateImages) counts.set(img, 0);
// Gather markdown files to scan
const markdownFiles = collectFilesUnder('', allowedMarkdownExt);
if (!markdownFiles.length) {
core.info('No Markdown files found; skipping references scan.');
}
const codeBlockPattern = /^```+([\s\S]*?)^```+$/gm;
const markdownImagePattern = /!\[(?:[^\]]*)\]\(\s*([^\)\s]+)(?:\s+"[^"]*")?\s*\)/g;
const htmlImagePattern = /<img\b[^>]*>/gi;
/**
 * Parses a GitHub file URL into its owner / repo / ref / path components.
 *
 * Recognised shapes:
 *   https://raw.githubusercontent.com/<owner>/<repo>/<ref>/<path...>
 *   https://github.com/<owner>/<repo>/(raw|blob)/<ref>/<path...>
 * In both shapes `<ref>` may also be written as `refs/heads/<name>` or `refs/tags/<name>`.
 * The ref is assumed to be a single path segment; branch names containing '/' cannot be
 * told apart from the file path and are not supported.
 *
 * @param {string} rawUrl - URL as written in the document.
 * @returns {{owner: string, repo: string, ref: string, path: string}|null} The components, or null
 *   when `rawUrl` is not an absolute URL of one of the recognised shapes. Never throws: path
 *   segments that are not valid percent-encoding are kept undecoded.
 */
function parseGithubRawLink(rawUrl) {
  let parsed;
  try {
    parsed = new URL(rawUrl);
  } catch (_) {
    // Relative paths and other non-URL strings end up here.
    return null;
  }
  // decodeURIComponent throws URIError on a stray '%'; keep such a segment as written.
  const safeDecode = (segment) => {
    try {
      return decodeURIComponent(segment);
    } catch (_) {
      return segment;
    }
  };
  const hostname = parsed.hostname.toLowerCase();
  const parts = parsed.pathname.split('/').filter(Boolean);
  let refAt;
  if (hostname === 'raw.githubusercontent.com') {
    if (parts.length < 3) return null;
    refAt = 2;
  } else if (hostname === 'github.com') {
    if (parts.length < 5) return null;
    if (!['raw', 'blob'].includes(parts[2])) return null;
    refAt = 3;
  } else {
    return null;
  }
  const [owner, repo] = parts;
  let ref = safeDecode(parts[refAt]);
  let pathAt = refAt + 1;
  // Fully-qualified ref: skip the "refs/heads" or "refs/tags" prefix so the path starts after the name.
  const isQualifiedRef = parts[refAt] === 'refs'
    && ['heads', 'tags'].includes(parts[refAt + 1])
    && parts.length > refAt + 2;
  if (isQualifiedRef) {
    ref = safeDecode(parts[refAt + 2]);
    pathAt = refAt + 3;
  }
  const rel = parts.slice(pathAt).map(safeDecode).join('/');
  return { owner, repo, ref, path: rel };
}
/**
 * Turns an image reference found in a Markdown file into a workspace-relative path.
 *
 * Removes a surrounding `<...>` pair, drops the query string and the '#fragment'
 * (e.g. `?raw=true`, `#gh-dark-mode-only`), and percent-decodes the rest so that
 * `my%20pic.png` matches a file named `my pic.png`.
 *
 * @param {string} sourceFile - Workspace-relative path of the file containing the reference.
 * @param {string} raw - Reference as written in the document.
 * @returns {string|null} Path relative to `workspaceRoot` using '/' separators: a leading '/'
 *   is read as "from the repository root", anything else as relative to `sourceFile`'s folder.
 *   null when nothing is left after stripping (empty, query-only or fragment-only reference).
 */
function normalizeLocalPath(sourceFile, raw) {
  if (!raw) return null;
  let t = raw.trim();
  if (t.startsWith('<') && t.endsWith('>')) t = t.slice(1, -1).trim();
  // Cut at whichever of '?' (query) or '#' (fragment) comes first.
  const cutAt = t.search(/[?#]/);
  if (cutAt !== -1) t = t.slice(0, cutAt);
  if (!t) return null;
  try {
    t = decodeURIComponent(t);
  } catch (_) {
    // Not valid percent-encoding (e.g. a literal '%' in the name): use the text as written.
  }
  // Absolute repository path.
  if (t.startsWith('/')) {
    return t.slice(1).replace(/\\/g, '/');
  }
  // Path relative to the folder of the referencing file.
  const candidate = path.normalize(path.join(path.dirname(sourceFile), t));
  return path
    .relative(workspaceRoot, path.resolve(workspaceRoot, candidate))
    .replace(/\\/g, '/');
}
// Scan every Markdown file and count how often each candidate image is referenced.

// Candidate images grouped by lower-cased file name, built once so that the basename
// fallback below does not rescan every image for each unmatched reference.
const imagesByBasename = new Map();
for (const img of counts.keys()) {
  const key = path.basename(img).toLowerCase();
  if (!imagesByBasename.has(key)) imagesByBasename.set(key, []);
  imagesByBasename.get(key).push(img);
}

/**
 * Counts one image reference, shared by the Markdown and the HTML scanners.
 *
 * @param {string} sourceFile - Workspace-relative path of the file containing the reference.
 * @param {string} url - Image URL or path as written in the document.
 */
function recordImageReference(sourceFile, url) {
  // A GitHub URL pointing at this repository is matched by its repository path only.
  const repoPath = parseGithubRawLink(url);
  if (repoPath && repoPath.owner === currentOwner && repoPath.repo === currentRepo) {
    const normalized = repoPath.path.replace(/\\/g, '/');
    if (counts.has(normalized)) counts.set(normalized, counts.get(normalized) + 1);
    return;
  }
  const local = normalizeLocalPath(sourceFile, url);
  if (!local) return;
  // Try the path directly.
  if (counts.has(local)) {
    counts.set(local, counts.get(local) + 1);
    return;
  }
  // Fallback: match by file name, but only when it is unique, to avoid ambiguous counting.
  const matches = imagesByBasename.get(path.basename(local).toLowerCase()) || [];
  if (matches.length === 1) counts.set(matches[0], counts.get(matches[0]) + 1);
}

// Extracts the src attribute of an <img> tag (double-quoted, single-quoted or bare value).
// The lookbehind keeps attributes that merely end in "src" (e.g. data-src) from matching.
const srcAttrPattern = /(?<![\w-])src\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+))/i;

for (const file of markdownFiles) {
  const text = fs.readFileSync(path.join(workspaceRoot, file), 'utf8');
  const textWithoutCodeBlocks = text.replace(codeBlockPattern, '');
  // Markdown-style images: ![alt](url)
  for (const imageMatch of textWithoutCodeBlocks.matchAll(markdownImagePattern)) {
    recordImageReference(file, imageMatch[1]);
  }
  // HTML images: <img src="...">
  for (const tagMatch of textWithoutCodeBlocks.matchAll(htmlImagePattern)) {
    const srcMatch = srcAttrPattern.exec(tagMatch[0]);
    const url = srcMatch ? (srcMatch[1] || srcMatch[2] || srcMatch[3]) : null;
    if (url) recordImageReference(file, url);
  }
}
// Ranking: every candidate image, most-referenced first, ties broken by path.
const ranking = Array.from(counts, ([img, cnt]) => ({ img, cnt }));
ranking.sort((a, b) => {
  if (a.cnt !== b.cnt) return b.cnt - a.cnt;
  return a.img.localeCompare(b.img);
});
// One line per image: "<path>, <number of references>".
const rankingLines = ranking.map(({ img, cnt }) => `${img}, ${cnt}`);
core.exportVariable('RANKING_BLOCK', rankingLines.join('\n'));
// Images nobody references. ERROR_BLOCK is always exported: non-empty makes the later step fail the job.
const unreferenced = ranking.filter(({ cnt }) => cnt === 0).map(({ img }) => img);
if (unreferenced.length > 0) {
  core.exportVariable('ERROR_BLOCK', unreferenced.join('\n'));
  core.info(`Found ${unreferenced.length} unreferenced image(s) in images/`);
} else {
  core.exportVariable('ERROR_BLOCK', '');
  core.info('No unreferenced images found in images/.');
}
# Prints the ranking exported by the script step (RANKING_BLOCK) inside a fenced block.
- name: Show image ranking
run: |
echo 'Image ranking (image path, number of references):'
printf '```\n%s\n```\n' "$RANKING_BLOCK"
# Runs only when the script exported a non-empty ERROR_BLOCK: prints the images and fails the job.
- name: Show unreferenced images
if: env.ERROR_BLOCK != ''
run: |
echo 'Unreferenced images under images/ (image path):'
printf '```\n%s\n```\n' "$ERROR_BLOCK"
exit 1