build ocr
This commit is contained in:
@@ -13,6 +13,7 @@
|
||||
"@fastify/static": "^7.0.4",
|
||||
"better-sqlite3": "^9.4.3",
|
||||
"fastify": "^4.27.0",
|
||||
"node-tesseract-ocr": "^2.2.1",
|
||||
"sharp": "^0.33.4",
|
||||
"uuid": "^9.0.1"
|
||||
},
|
||||
|
||||
@@ -34,6 +34,7 @@ db.exec(`
|
||||
height INTEGER NOT NULL,
|
||||
parent_id TEXT REFERENCES memes(id) ON DELETE CASCADE,
|
||||
collection_id INTEGER REFERENCES collections(id) ON DELETE SET NULL,
|
||||
ocr_text TEXT,
|
||||
created_at TEXT NOT NULL DEFAULT (datetime('now'))
|
||||
);
|
||||
|
||||
@@ -54,15 +55,22 @@ db.exec(`
|
||||
CREATE INDEX IF NOT EXISTS idx_meme_tags_tag_id ON meme_tags(tag_id);
|
||||
`);
|
||||
|
||||
// Migration: add collection_id column if upgrading from earlier schema
|
||||
// Must run BEFORE creating the index on that column
|
||||
// Migrations — run after CREATE TABLE IF NOT EXISTS so they only apply to existing DBs
|
||||
const memesCols = db.prepare('PRAGMA table_info(memes)').all() as { name: string }[];
|
||||
|
||||
if (!memesCols.find((c) => c.name === 'collection_id')) {
|
||||
db.exec('ALTER TABLE memes ADD COLUMN collection_id INTEGER REFERENCES collections(id) ON DELETE SET NULL');
|
||||
}
|
||||
|
||||
// Create index after the column is guaranteed to exist (handles both fresh and migrated DBs)
|
||||
db.exec('CREATE INDEX IF NOT EXISTS idx_memes_collection_id ON memes(collection_id)');
|
||||
if (!memesCols.find((c) => c.name === 'ocr_text')) {
|
||||
db.exec('ALTER TABLE memes ADD COLUMN ocr_text TEXT');
|
||||
}
|
||||
|
||||
// Indexes that depend on migrated columns — created after columns are guaranteed to exist
|
||||
db.exec(`
|
||||
CREATE INDEX IF NOT EXISTS idx_memes_collection_id ON memes(collection_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_memes_ocr ON memes(ocr_text) WHERE ocr_text IS NOT NULL;
|
||||
`);
|
||||
|
||||
// Seed the default UNSORTED collection
|
||||
const defaultCollection = db
|
||||
|
||||
@@ -8,6 +8,7 @@ import { memesRoutes } from './routes/memes.js';
|
||||
import { tagsRoutes } from './routes/tags.js';
|
||||
import { authRoutes } from './routes/auth.js';
|
||||
import { collectionsRoutes } from './routes/collections.js';
|
||||
import { adminRoutes } from './routes/admin.js';
|
||||
|
||||
// Ensure data dirs exist
|
||||
ensureImagesDir();
|
||||
@@ -41,6 +42,7 @@ await app.register(authRoutes);
|
||||
await app.register(collectionsRoutes);
|
||||
await app.register(memesRoutes);
|
||||
await app.register(tagsRoutes);
|
||||
await app.register(adminRoutes);
|
||||
|
||||
// SPA fallback — serve index.html for all non-API, non-image routes
|
||||
app.setNotFoundHandler(async (req, reply) => {
|
||||
|
||||
52
backend/src/routes/admin.ts
Normal file
52
backend/src/routes/admin.ts
Normal file
@@ -0,0 +1,52 @@
|
||||
import type { FastifyInstance } from 'fastify';
|
||||
import db from '../db.js';
|
||||
import { requireAuth } from '../auth.js';
|
||||
import { extractText } from '../services/ocr.js';
|
||||
import type { Meme } from '../types.js';
|
||||
|
||||
/**
 * Registers the admin-only OCR maintenance endpoints on the given Fastify
 * instance. Both routes are guarded by the `requireAuth` pre-handler.
 *
 * @param app Fastify instance the routes are attached to.
 */
export async function adminRoutes(app: FastifyInstance) {
  /**
   * POST /api/admin/reindex
   * Runs OCR on every meme whose ocr_text is still NULL.
   * Processes sequentially to avoid hammering the CPU.
   * Responds once the whole batch is finished with
   * `{ total, indexed, no_text_found }`.
   */
  app.post('/api/admin/reindex', { preHandler: requireAuth }, async () => {
    const pending = db
      .prepare('SELECT id, file_path, mime_type FROM memes WHERE ocr_text IS NULL')
      .all() as Pick<Meme, 'id' | 'file_path' | 'mime_type'>[];

    // Prepared once and reused for every row instead of re-preparing per iteration.
    const storeOcrText = db.prepare('UPDATE memes SET ocr_text = ? WHERE id = ?');

    let indexed = 0;
    let noTextFound = 0;

    for (const meme of pending) {
      const text = await extractText(meme.file_path, meme.mime_type);

      // An empty result is stored as '' (not NULL) so the meme is not picked up
      // again on subsequent runs. NOTE(review): extractText also returns '' when
      // OCR itself fails, so failed images are marked as processed as well.
      storeOcrText.run(text || '', meme.id);

      if (text) {
        indexed++;
      } else {
        noTextFound++;
      }
    }

    // Returning a plain object lets Fastify serialize it as JSON and set the
    // Content-Type header itself; no manual header handling is needed.
    return { total: pending.length, indexed, no_text_found: noTextFound };
  });

  /**
   * GET /api/admin/reindex/status
   * Returns how many memes still need OCR indexing (`pending`, ocr_text IS NULL)
   * and how many have non-empty OCR text (`indexed`).
   */
  app.get('/api/admin/reindex/status', { preHandler: requireAuth }, async () => {
    const { pending } = db
      .prepare('SELECT COUNT(*) as pending FROM memes WHERE ocr_text IS NULL')
      .get() as { pending: number };
    const { indexed } = db
      .prepare("SELECT COUNT(*) as indexed FROM memes WHERE ocr_text IS NOT NULL AND ocr_text != ''")
      .get() as { indexed: number };
    return { pending, indexed };
  });
}
|
||||
@@ -4,6 +4,7 @@ import { v4 as uuidv4 } from 'uuid';
|
||||
import db, { UNSORTED_ID } from '../db.js';
|
||||
import { buildFilePath, deleteFile, getExtension } from '../services/storage.js';
|
||||
import { extractMeta, resizeImage, saveBuffer } from '../services/image.js';
|
||||
import { extractText } from '../services/ocr.js';
|
||||
import { requireAuth } from '../auth.js';
|
||||
import type { ListQuery, UpdateBody, RescaleBody, MoveBody, Meme } from '../types.js';
|
||||
|
||||
@@ -72,8 +73,8 @@ export async function memesRoutes(app: FastifyInstance) {
|
||||
}
|
||||
|
||||
if (q) {
|
||||
conditions.push('(m.title LIKE ? OR m.description LIKE ?)');
|
||||
params.push(`%${q}%`, `%${q}%`);
|
||||
conditions.push('(m.title LIKE ? OR m.description LIKE ? OR m.ocr_text LIKE ?)');
|
||||
params.push(`%${q}%`, `%${q}%`, `%${q}%`);
|
||||
}
|
||||
|
||||
if (conditions.length) {
|
||||
@@ -98,7 +99,7 @@ export async function memesRoutes(app: FastifyInstance) {
|
||||
countParams.push(tag.toLowerCase());
|
||||
}
|
||||
if (collection_id !== undefined) countParams.push(Number(collection_id));
|
||||
if (q) countParams.push(`%${q}%`, `%${q}%`);
|
||||
if (q) countParams.push(`%${q}%`, `%${q}%`, `%${q}%`);
|
||||
|
||||
if (countConditions.length) countSql += ' WHERE ' + countConditions.join(' AND ');
|
||||
|
||||
@@ -159,6 +160,11 @@ export async function memesRoutes(app: FastifyInstance) {
|
||||
|
||||
if (tagsRaw) setMemeTags(id, tagsRaw.split(','));
|
||||
|
||||
// Fire OCR in the background — doesn't block the upload response
|
||||
extractText(filePath, mimeType).then((text) => {
|
||||
if (text) db.prepare('UPDATE memes SET ocr_text = ? WHERE id = ?').run(text, id);
|
||||
});
|
||||
|
||||
return reply.status(201).send(getMemeById(id));
|
||||
});
|
||||
|
||||
|
||||
47
backend/src/services/ocr.ts
Normal file
47
backend/src/services/ocr.ts
Normal file
@@ -0,0 +1,47 @@
|
||||
import tesseract from 'node-tesseract-ocr';
|
||||
import sharp from 'sharp';
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
import { absolutePath } from './storage.js';
|
||||
|
||||
/** Tesseract options used for every OCR run. */
const OCR_CONFIG = {
  lang: 'eng',
  oem: 1, // LSTM neural net mode — best accuracy
  psm: 3, // Fully automatic page segmentation (good for varied meme layouts)
};

/** Per-process counter that keeps temporary frame file names unique. */
let tempFrameSeq = 0;

/**
 * Runs OCR on a stored image and returns the recognized text as a single
 * whitespace-normalized line.
 *
 * @param relPath  Image path as stored in the DB; resolved via `absolutePath`.
 * @param mimeType MIME type of the image. `image/gif` is converted to a
 *                 temporary PNG before being handed to Tesseract.
 * @returns The cleaned text, or `''` when nothing was recognized or OCR failed.
 *          This function never rejects: failures are logged and swallowed.
 */
export async function extractText(relPath: string, mimeType: string): Promise<string> {
  const srcAbs = absolutePath(relPath);
  let inputPath = srcAbs;
  let tempPath: string | null = null;

  try {
    // Animated GIFs: extract first frame as PNG for Tesseract (it can't read GIF directly)
    if (mimeType === 'image/gif') {
      // Unique per call, so two concurrent OCR runs on the same GIF cannot
      // overwrite or delete each other's temporary frame.
      tempFrameSeq += 1;
      tempPath = `${srcAbs}.${process.pid}-${tempFrameSeq}.ocr_tmp.png`;
      await sharp(srcAbs, { animated: false }).png().toFile(tempPath);
      inputPath = tempPath;
    }

    const raw = await tesseract.recognize(inputPath, OCR_CONFIG);

    // Clean up: drop lines that are pure noise (< 2 chars), join the rest and
    // collapse every run of whitespace into a single space.
    return raw
      .split('\n')
      .map((line) => line.trim())
      .filter((line) => line.length >= 2)
      .join(' ')
      .replace(/\s+/g, ' ')
      .trim();
  } catch (err: unknown) {
    // OCR failure is non-fatal — image still gets saved, just won't be text-searchable
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(`OCR failed for ${relPath}:`, reason);
    return '';
  } finally {
    if (tempPath) {
      // Async, and tolerant of a missing file (`force`); a cleanup error is
      // logged rather than thrown so it cannot replace the function's result.
      await fs.promises.rm(tempPath, { force: true }).catch((cleanupErr: unknown) => {
        const reason = cleanupErr instanceof Error ? cleanupErr.message : String(cleanupErr);
        console.warn(`Could not remove OCR temp file ${tempPath}:`, reason);
      });
    }
  }
}
|
||||
@@ -10,6 +10,7 @@ export interface Meme {
|
||||
height: number;
|
||||
parent_id: string | null;
|
||||
collection_id: number | null;
|
||||
ocr_text: string | null;
|
||||
created_at: string;
|
||||
tags: string[];
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user