diff --git a/cueweb/.env.example b/cueweb/.env.example index b4ff813e0..3f7e2e1c7 100644 --- a/cueweb/.env.example +++ b/cueweb/.env.example @@ -1,5 +1,12 @@ NEXT_PUBLIC_OPENCUE_ENDPOINT=http://your-rest-gateway-url.com +# Optional allow-list for the Stuck Frames "Last Line" reader +# (/api/stuck-frames/lastline), as a colon-separated list of absolute path +# prefixes. When set, only .rqlog files under one of these roots are read. +# When unset, reads are not restricted to a root (job log paths are +# site-specific). Set this to harden the deployment. +# CUEWEB_LOG_ROOTS=/mnt/logs:/shows + # Sentry values SENTRY_ENVIRONMENT='development' SENTRY_DSN = sentrydsn diff --git a/cueweb/README.md b/cueweb/README.md index 1fa74e596..d382bec86 100644 --- a/cueweb/README.md +++ b/cueweb/README.md @@ -377,6 +377,7 @@ The current CueWeb system offers a robust set of features designed to enhance us - **Auto-reloading:** Real-time updates for tables. - **Job-finished notifications (two channels):** A per-row **Notify bell** subscribes the browser - a background poller fires a toast (and an optional desktop popup when notification permission is granted) when the job reaches `FINISHED`; subscriptions persist in `localStorage`, sync across tabs, and the notify decision is serialized cross-tab via the Web Locks API so only one tab toasts when several poll the same job. The right-click **Subscribe to Job** menu entry opens a CueGUI-parity dialog that registers an *email* subscriber on Cuebot via the `AddSubscriber` RPC, so Cuebot mails the saved address when the job finishes. The two channels are independent. - **Job dependencies (CueGUI parity):** the job right-click menu groups four entries together. **View Dependencies...** opens a themed dialog mirroring CueGUI's `DependDialog` - a Type / Target / Active / OnJob / OnLayer / OnFrame table backed by the `GetDepends` RPC, with a **Refresh** button. 
**Dependency Wizard...** is a multi-step state machine covering every CueGUI `depend.DependType` (Job-On-Job / Layer / Frame, Frame By Frame for all layers / Hard Depend, Layer-On-Job / Layer / Frame, Frame By Frame, Frame-On-Job / Layer / Frame, Layer on Simulation Frame); every picker (source layers / frames, target jobs / layers / frames) is multi-select and Done fires the full source x target cross-product in one bulk batch. The Hard Depend variant pairs source/target layers by `layer.type` and fans out one `CreateFrameByFrameDependency` per matched pair across every picked target job. **Drop External Dependencies** and **Drop Internal Dependencies** call the `DropDepends` RPC with `target = EXTERNAL` / `INTERNAL` respectively; on success they dispatch `cueweb:refresh-now` and `cueweb:depends-changed` so the Jobs table re-polls and the Group-By Dependent tree cache rebuilds immediately. +- **Stuck Frames (CueCommander -> Stuck Frame):** a stuck-frame finder at `/stuck-frames`, the CueWeb equivalent of CueGUI's CueCommander Stuck Frame window. Scans every running frame across active jobs and flags the ones whose log has gone silent relative to runtime, grouped under their job, with columns Name / Frame / Host / LLU / Runtime / % Stuck / Average / Last Line and **Auto-refresh** / **Refresh** / **Clear** controls. Detection thresholds run client-side and persist per browser (% of Run Since LLU, Min LLU, % Avg Completion, Total Runtime, Exclude Keywords), with a **+** button to add per-service filter rows so long-running services (e.g. Arnold) can use looser limits than quick ones. Frame right-click actions: Tail / View / View Last Log, Retry / Eat / Kill, Log Stuck Frame (and Log and Retry / Eat / Kill), Frame Not Stuck, Add Job to Excludes / Exclude and Remove Job, **Core Up** (raise the layer's minimum cores), and View Host; job-header actions add View Comments, Job Not Stuck, and Core Up across the job's stuck layers. 
// - **Logs:** View current and previous logs via dropdown.
// - **Security:** Use JWT-based authorization and secure headers.
// - **Keyboard shortcuts:** Press `?` anywhere in the app to open a cheat-sheet overlay; the same overlay is also reachable from **Other ▸ Show Shortcuts** in the header or the sidebar. An optional **Notify on Shortcut** toggle (also under Other) fires a toast naming the shortcut that just triggered. See [Keyboard shortcuts](#keyboard-shortcuts) below for the full list.
// diff --git a/cueweb/app/api/layer/action/setmincores/route.ts b/cueweb/app/api/layer/action/setmincores/route.ts
// new file mode 100644
// index 000000000..5f7694d5b
// --- /dev/null
// +++ b/cueweb/app/api/layer/action/setmincores/route.ts
// @@ -0,0 +1,57 @@
/*
 * Copyright Contributors to the OpenCue Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import { handleRoute } from '@/app/utils/api_utils';
import { NextRequest, NextResponse } from "next/server";

/** Minimal shape a request body must have before it is forwarded. */
interface SetMinCoresBody {
  layer: { id: string };
  cores: number;
}

/**
 * Runtime check for the request body.
 *
 * `cores` is a float proto field (fractional core counts are valid), so only
 * non-finite values (typeof NaN is "number") and negative values are rejected,
 * not fractions. `layer.id` must be a non-blank string.
 */
function isSetMinCoresBody(value: unknown): value is SetMinCoresBody {
  if (typeof value !== 'object' || value === null) return false;
  const { layer, cores } = value as { layer?: unknown; cores?: unknown };
  if (typeof layer !== 'object' || layer === null) return false;
  const { id } = layer as { id?: unknown };
  return (
    typeof id === 'string' &&
    id.trim() !== '' &&
    typeof cores === 'number' &&
    Number.isFinite(cores) &&
    cores >= 0
  );
}

/**
 * Set a layer's minimum cores (CueGUI Stuck Frame "Core Up").
 *
 * Request body: `{ layer, cores }`. RPC: /job.LayerInterface/SetMinCores.
 *
 * Responds 405 for a non-POST method, 400 for unparsable JSON or a body that
 * fails validation, and otherwise relays the status of the forwarded call with
 * either `{ data }` or `{ error }`.
 */
export async function POST(request: NextRequest) {
  const endpoint = "/job.LayerInterface/SetMinCores";
  const method = request.method;
  if (method !== 'POST') {
    return NextResponse.json({ error: 'Invalid method. Only POST is allowed.' }, { status: 405 });
  }

  let jsonBody: unknown;
  try {
    jsonBody = await request.json();
  } catch {
    return NextResponse.json({ error: 'Invalid JSON in request body' }, { status: 400 });
  }

  if (!isSetMinCoresBody(jsonBody)) {
    return NextResponse.json(
      { error: 'Invalid request body: layer.id and non-negative numeric cores are required' },
      { status: 400 },
    );
  }

  // The validated body is forwarded unchanged (extra layer fields included).
  const response = await handleRoute(method, endpoint, JSON.stringify(jsonBody), true);
  const responseData = await response.json();

  if (!response.ok) {
    return NextResponse.json({ error: responseData.error }, { status: response.status });
  }
  return NextResponse.json({ data: responseData.data }, { status: response.status });
}

// diff --git a/cueweb/app/api/stuck-frames/lastline/route.ts b/cueweb/app/api/stuck-frames/lastline/route.ts
// new file mode 100644
// index 000000000..bd31c1d45
// --- /dev/null
// +++ b/cueweb/app/api/stuck-frames/lastline/route.ts
// @@ -0,0 +1,93 @@
/*
 * Copyright Contributors to the OpenCue Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
/*
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import { NextRequest, NextResponse } from "next/server";
import { execFile as execFileCallback } from "child_process";
import { promisify } from "util";
import { promises as fs } from "fs";
import path from "path";

const execFile = promisify(execFileCallback);

/**
 * Optional per-site allow-list, read from CUEWEB_LOG_ROOTS as colon-separated
 * absolute prefixes. When set, only .rqlog files under one of these roots are
 * read; when unset (empty result), reads aren't restricted to a root because
 * job log paths are site-specific.
 */
function allowedLogRoots(): string[] {
  const configured = process.env.CUEWEB_LOG_ROOTS ?? "";
  return configured
    .split(":")
    .map((root) => root.trim())
    .filter(Boolean);
}

/**
 * True when `target` is `root` itself or lies beneath it.
 *
 * Only a relative path that *is* `..` or starts with `..` plus a separator
 * escapes the root; a plain `startsWith("..")` test would also reject entries
 * such as `..cache/frame.rqlog` that live inside the root.
 */
function isWithinRoot(root: string, target: string): boolean {
  const rel = path.relative(root, target);
  if (rel === "") return true;
  if (path.isAbsolute(rel)) return false;
  return rel !== ".." && !rel.startsWith(`..${path.sep}`);
}

/** The best-effort "nothing to show" response used on every failure path. */
function emptyLastLine() {
  return NextResponse.json({ lastLine: "" }, { status: 200 });
}

/**
 * Returns the last non-empty line of a frame's .rqlog (the Stuck Frames
 * "Last Line" column, mirroring CueGUI's getLastLine).
 *
 * Best-effort: if the log filesystem isn't mounted in this deployment, or the
 * file is missing, the response carries an empty line rather than an error.
 * The caller-supplied `path` query parameter is untrusted, so it is only used
 * after canonicalisation (realpath), a `.rqlog` extension check on the real
 * file, a regular-file check, and the optional root allow-list; `tail` is run
 * through execFile (no shell) with `--` ahead of the path.
 */
export async function GET(request: NextRequest) {
  const rawPath = request.nextUrl.searchParams.get("path");
  if (!rawPath || !rawPath.endsWith(".rqlog")) {
    return emptyLastLine();
  }

  // Canonicalize (follows symlinks) so the extension / root checks apply to the
  // real file rather than a lexical path. A missing/unreadable file resolves to
  // the best-effort empty response.
  let target: string;
  try {
    target = await fs.realpath(path.resolve(rawPath));
    // Skip directories, FIFOs and devices: tail could fail or block on them.
    const info = await fs.stat(target);
    if (!info.isFile()) return emptyLastLine();
  } catch {
    return emptyLastLine();
  }
  if (!target.endsWith(".rqlog")) {
    return emptyLastLine();
  }

  const rawRoots = allowedLogRoots();
  if (rawRoots.length > 0) {
    // Roots that cannot be resolved are dropped; if none resolve, nothing is
    // allowed.
    const resolved = await Promise.all(
      rawRoots.map((root) => fs.realpath(path.resolve(root)).catch(() => null)),
    );
    const roots = resolved.filter((root): root is string => root !== null);
    if (!roots.some((root) => isWithinRoot(root, target))) {
      return emptyLastLine();
    }
  }

  try {
    // tail the file, then keep the last non-blank line.
    const { stdout } = await execFile("tail", ["-n", "20", "--", target], {
      timeout: 5000,
      maxBuffer: 1024 * 1024,
    });
    const lines = stdout
      .split("\n")
      .map((line) => line.trimEnd())
      .filter((line) => line.trim() !== "");
    return NextResponse.json(
      { lastLine: lines.length ? lines[lines.length - 1] : "" },
      { status: 200 },
    );
  } catch {
    return emptyLastLine();
  }
}

// diff --git a/cueweb/app/api/stuck-frames/route.ts b/cueweb/app/api/stuck-frames/route.ts
// new file mode 100644
// index 000000000..c6dd3fd80
// --- /dev/null
// +++ b/cueweb/app/api/stuck-frames/route.ts
// @@ -0,0 +1,163 @@
/*
 * Copyright Contributors to the OpenCue Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
/*
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import { fetchObjectFromRestGateway } from '@/app/utils/api_utils';
import { NextRequest, NextResponse } from "next/server";

// Server-side data gathering for the Stuck Frames page. CueGUI's
// StuckFramePlugin walks every show's procs; we approximate by listing the
// unfinished jobs (GetJobs) and, per job, fetching its RUNNING frames
// (GetFrames, FrameState 2) and its layers (GetLayers, for the per-service
// average frame time). Each frame is stamped with its job, service and the
// layer's average frame time so the client can apply the full CueGUI
// stuck-detection predicate (LLU / % stuck / avg-completion / runtime) live
// against the user's per-service filter thresholds.
//
// RPCs: /job.JobInterface/GetJobs, /job.JobInterface/GetFrames,
// /job.JobInterface/GetLayers.

const RUNNING_STATE = 2; // FrameState.RUNNING (proto/src/job.proto)
const MAX_FRAMES_PER_JOB = 1000;
// Cap the per-job GetFrames/GetLayers fan-out. Without a bound, a farm with
// thousands of unfinished jobs would fire that many concurrent gateway calls
// on every client poll; this keeps at most N jobs in flight at a time.
const MAX_CONCURRENT_JOBS = 16;

/**
 * POST `body` to a REST-gateway endpoint and return the `data` member of the
 * JSON reply.
 *
 * Best-effort: resolves to `null` when the call or JSON parsing throws, when
 * the reply carries an `error` member, or when `data` is null/undefined.
 * Callers treat `null` as "nothing available".
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- gateway payloads are untyped
async function gatewayJson(endpoint: string, body: string): Promise<any> {
  try {
    const resp = await fetchObjectFromRestGateway(endpoint, "POST", body);
    const json = await resp.json();
    if (json?.error) return null;
    return json?.data ?? null;
  } catch {
    return null;
  }
}

// Map over items with a fixed-size worker pool, preserving input order. Plain
// promises (no extra dependency) so the route stays self-contained.
/**
 * Map `fn` over `items` with at most `limit` calls in flight, preserving input
 * order in the result.
 *
 * @param items  Inputs; `results[i]` corresponds to `items[i]`.
 * @param limit  Maximum number of concurrent workers. Values below 1 (or NaN)
 *               fall back to a single worker so every item is still processed.
 * @param fn     Async mapper, called with the item and its index.
 * @returns      The mapped values in input order. Rejects as soon as any `fn`
 *               call rejects.
 */
async function mapWithConcurrency<T, R>(
  items: T[],
  limit: number,
  fn: (item: T, index: number) => Promise<R>,
): Promise<R[]> {
  const results = new Array<R>(items.length);
  const poolSize = limit >= 1 ? Math.floor(limit) : 1;
  // Shared cursor: each worker claims the next unprocessed index. The claim is
  // synchronous, so two workers never take the same slot.
  let cursor = 0;
  const worker = async (): Promise<void> => {
    for (let index = cursor++; index < items.length; index = cursor++) {
      results[index] = await fn(items[index], index);
    }
  };
  const workerCount = Math.min(poolSize, items.length);
  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}

// Safety bound on pagination so a malformed/never-shrinking response can't spin
// forever: MAX_FRAME_PAGES * MAX_FRAMES_PER_JOB frames per job.
const MAX_FRAME_PAGES = 50;

/**
 * Fetch every RUNNING frame for a job, paging through GetFrames until a short
 * page arrives. A single page caps at MAX_FRAMES_PER_JOB, so a job with more
 * running frames than that would otherwise be silently truncated.
 *
 * A failed page yields no frames and therefore also ends the loop; frames
 * gathered from earlier pages are still returned.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- gateway payloads are untyped
async function getRunningFrames(job: any): Promise<any[]> {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const collected: any[] = [];
  for (let page = 1; page <= MAX_FRAME_PAGES; page++) {
    const request = {
      job: { id: job.id, name: job.name },
      req: {
        include_finished: false,
        page,
        limit: MAX_FRAMES_PER_JOB,
        states: { frame_states: [RUNNING_STATE] },
      },
    };
    const framesData = await gatewayJson("/job.JobInterface/GetFrames", JSON.stringify(request));
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const batch: any[] = Array.isArray(framesData?.frames?.frames) ? framesData.frames.frames : [];
    collected.push(...batch);
    // A short (or empty/failed) page means we've reached the end.
    if (batch.length < MAX_FRAMES_PER_JOB) break;
  }
  return collected;
}

/**
 * Gather one job's RUNNING frames and stamp each with job- and layer-level
 * details: service and average frame time for detection, layer id and minCores
 * for the Core Up action.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- gateway payloads are untyped
async function collectJobFrames(job: any): Promise<any[]> {
  const [frames, layersData] = await Promise.all([
    getRunningFrames(job),
    gatewayJson(
      "/job.JobInterface/GetLayers",
      JSON.stringify({ job: { id: job.id, name: job.name } }),
    ),
  ]);

  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const layers: any[] = Array.isArray(layersData?.layers?.layers) ? layersData.layers.layers : [];
  // layerName -> details to attach to each frame of that layer.
  const layerInfo = new Map<
    string,
    { id: string; service: string; avgFrameSec: number; minCores: number }
  >();
  for (const layer of layers) {
    layerInfo.set(layer.name, {
      id: layer.id ?? "",
      // Only the first listed service is used for filter matching.
      service: Array.isArray(layer.services) && layer.services.length ? layer.services[0] : "",
      avgFrameSec: Number(layer.layerStats?.avgFrameSec ?? 0),
      minCores: Number(layer.minCores ?? 0),
    });
  }

  return frames
    .filter((f) => f.state === "RUNNING")
    .map((f) => {
      const info = layerInfo.get(f.layerName);
      return {
        ...f,
        jobId: job.id,
        jobName: job.name,
        jobLogDir: job.logDir ?? "",
        jobHasComment: !!job.hasComment,
        service: info?.service ?? "",
        avgFrameSec: info?.avgFrameSec ?? 0,
        layerId: info?.id ?? "",
        layerMinCores: info?.minCores ?? 0,
      };
    });
}

/**
 * List the RUNNING frames of every unfinished job, flattened into one array
 * under `data`. Responds 500 with `{ error }` when the job list cannot be
 * fetched or an unexpected exception escapes.
 */
export async function POST(_request: NextRequest) {
  try {
    const jobsData = await gatewayJson(
      "/job.JobInterface/GetJobs",
      JSON.stringify({ r: { include_finished: false } }),
    );
    if (jobsData === null) {
      return NextResponse.json({ error: "Failed to list jobs" }, { status: 500 });
    }
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const jobs: any[] = Array.isArray(jobsData?.jobs?.jobs) ? jobsData.jobs.jobs : [];

    const perJob = await mapWithConcurrency(jobs, MAX_CONCURRENT_JOBS, collectJobFrames);

    return NextResponse.json({ data: perJob.flat() }, { status: 200 });
  } catch (error: unknown) {
    // Anything can be thrown; only Error instances carry a message.
    const message = error instanceof Error ? error.message : String(error);
    return NextResponse.json({ error: message }, { status: 500 });
  }
}

// diff --git a/cueweb/app/stuck-frames/page.tsx b/cueweb/app/stuck-frames/page.tsx
// new file mode 100644
// index 000000000..467147b1d
// --- /dev/null
// +++ b/cueweb/app/stuck-frames/page.tsx
// @@ -0,0 +1,612 @@
"use client";

/*
 * Copyright Contributors to the OpenCue Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 */
/*
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import * as React from "react";
import { useSession } from "next-auth/react";
import { useRouter } from "next/navigation";
import { MessageSquare } from "lucide-react";

import type { Frame } from "@/app/frames/frame-columns";
import { StuckFrame, getStuckFrames, getStuckFrameLastLine } from "@/app/utils/get_utils";
import { eatFrames, killFrames, retryFrames, setLayerMinCores } from "@/app/utils/action_utils";
import { handleError, toastSuccess } from "@/app/utils/notify_utils";
import { Button } from "@/components/ui/button";
import { Checkbox } from "@/components/ui/checkbox";
import {
  Dialog,
  DialogContent,
  DialogFooter,
  DialogHeader,
  DialogTitle,
} from "@/components/ui/dialog";
import { Input } from "@/components/ui/input";
import { Skeleton } from "@/components/ui/skeleton";
import {
  DEFAULT_FILTER,
  StuckFrameFilters,
  type StuckFilter,
} from "@/components/ui/stuck-frame-filters";

// Auto-refresh polling interval, in milliseconds.
const AUTO_REFRESH_MS = 60000;
// localStorage key under which the filter rows are persisted.
const FILTERS_KEY = "cueweb.stuck-frames.filters";

// --- formatting -----------------------------------------------------------
/**
 * Format a duration in seconds as `H:MM:SS` (hours are not padded and may
 * exceed two digits; fractional seconds are truncated).
 *
 * Returns an empty string for zero, negative, NaN or infinite input so the
 * table cell stays blank instead of showing a meaningless value.
 */
function fmtDur(seconds: number): string {
  if (!Number.isFinite(seconds) || seconds <= 0) return "";
  const hours = Math.floor(seconds / 3600);
  const minutes = Math.floor((seconds % 3600) / 60);
  const secs = Math.floor(seconds % 60);
  const pad = (n: number) => String(n).padStart(2, "0");
  return `${hours}:${pad(minutes)}:${pad(secs)}`;
}

/**
 * Text before the first "/" of a frame's `lastResource`; empty when the value
 * is missing or blank.
 */
const hostOf = (lastResource?: string | null): string =>
  (lastResource || "").split("/")[0] || "";

// --- detection (CueGUI StuckFramePlugin parity)
// ---------------------------
/** Per-frame timing figures, all in seconds except the `percentStuck` ratio. */
type Metrics = { runtime: number; llu: number; percentStuck: number; avg: number };

/**
 * Derive the detection metrics for a frame at time `now` (epoch seconds).
 *
 * - `runtime`: seconds since `startTime`, or 0 when the frame has none.
 * - `llu`: seconds since the last log update, counted only while RUNNING.
 * - `percentStuck`: `llu / runtime` as a ratio (0.5 = 50 %), 0 when runtime
 *   is not positive.
 * - `avg`: the layer's average frame time as delivered with the frame.
 */
function metricsOf(f: StuckFrame, now: number): Metrics {
  const runtime = f.startTime ? now - f.startTime : 0;
  const isRunning = f.state === "RUNNING";
  const llu = isRunning && f.lluTime ? now - f.lluTime : 0;
  const percentStuck = runtime > 0 ? llu / runtime : 0;
  return { runtime, llu, percentStuck, avg: f.avgFrameSec };
}

// The catch-all filter (index 0) applies unless a later, service-specific
// filter matches the frame's service.
function pickFilter(f: StuckFrame, filters: StuckFilter[]): StuckFilter | undefined {
  // Index 0 is skipped on purpose: it is the fallback, never a specific match.
  const specific = filters
    .slice(1)
    .find((candidate) => candidate.service && candidate.service === f.service);
  return specific ?? filters[0];
}

/**
 * True when the frame's job or layer name matches any exclude keyword of the
 * filter.
 *
 * `filter.regex` is a comma-separated list; each entry is tried as a
 * case-insensitive regular expression and, if it does not compile, as a
 * case-insensitive substring. A missing `regex` excludes nothing; filter rows
 * restored from storage are not guaranteed to carry the field.
 */
function isExcluded(f: StuckFrame, filter: StuckFilter): boolean {
  const names = [f.jobName, f.layerName];
  const keywords = (filter.regex ?? "")
    .split(",")
    .map((entry) => entry.trim())
    .filter(Boolean);
  return keywords.some((keyword) => {
    let pattern: RegExp;
    try {
      pattern = new RegExp(keyword, "i");
    } catch {
      // Not a valid pattern: fall back to a plain substring comparison.
      const needle = keyword.toLowerCase();
      return names.some((name) => name.toLowerCase().includes(needle));
    }
    return names.some((name) => pattern.test(name));
  });
}

// Mirrors CueGUI: lluTime > minLLU AND %stuck > threshold AND runtime >
// avg*avgComp% AND %stuck < 1.1 AND runtime > 500s.
+function isStuck(f: StuckFrame, filter: StuckFilter | undefined, now: number): boolean { + if (!filter || !filter.enabled) return false; + if (isExcluded(f, filter)) return false; + const { runtime, llu, percentStuck, avg } = metricsOf(f, now); + return ( + llu > filter.minLlu * 60 && + percentStuck * 100 > filter.percentStuck && + runtime > (avg * filter.avgComp) / 100 && + percentStuck < 1.1 && + runtime > 500 + ); +} + +type MenuState = + | { kind: "frame"; x: number; y: number; frame: StuckFrame } + | { kind: "job"; x: number; y: number; jobId: string; jobName: string }; + +export default function StuckFramesPage() { + const router = useRouter(); + const { data: session } = useSession(); + const username = session?.user?.name ?? session?.user?.email ?? "cueweb"; + + const [raw, setRaw] = React.useState(null); + const [now, setNow] = React.useState(() => Date.now() / 1000); + const [filters, setFilters] = React.useState([{ ...DEFAULT_FILTER }]); + const [autoRefresh, setAutoRefresh] = React.useState(false); + const [notify, setNotify] = React.useState(false); + const [loading, setLoading] = React.useState(false); + // Set when a load fails, so a failed scan isn't rendered as "no stuck frames". + const [loadError, setLoadError] = React.useState(null); + + // Client-side removals: "Frame/Job Not Stuck". + const [hiddenFrames, setHiddenFrames] = React.useState>(new Set()); + const [hiddenJobs, setHiddenJobs] = React.useState>(new Set()); + + const [lastLines, setLastLines] = React.useState>({}); + const [menu, setMenu] = React.useState(null); + const [coreUp, setCoreUp] = React.useState<{ targets: { id: string; name: string }[]; cores: string } | null>(null); + const [busyId, setBusyId] = React.useState(null); + + // Restore persisted filters on mount. 
+ React.useEffect(() => { + const stored = window.localStorage.getItem(FILTERS_KEY); + if (stored) { + try { + const parsed = JSON.parse(stored); + if (Array.isArray(parsed) && parsed.length > 0) setFilters(parsed); + } catch { + /* ignore corrupt value */ + } + } + }, []); + + function persistFilters(next: StuckFilter[]) { + setFilters(next); + window.localStorage.setItem(FILTERS_KEY, JSON.stringify(next)); + } + + // Returns the loaded frames (null on cancel/error) so callers can act on the + // fresh data without waiting for the state/memo round-trip. + const load = React.useCallback(async (isCancelled?: () => boolean): Promise => { + setLoading(true); + try { + const data = await getStuckFrames(); + if (isCancelled?.()) return null; + setLoadError(null); + setRaw(data); + setNow(Date.now() / 1000); + return data; + } catch (err) { + if (isCancelled?.()) return null; + handleError(err, "Could not load stuck frames"); + // Keep the last good rows on a failed poll, but record the error so a + // failed scan with no data shows an error rather than the empty state. + setLoadError("Could not load stuck frames."); + setRaw((prev) => prev ?? []); + return null; + } finally { + if (!isCancelled?.()) setLoading(false); + } + }, []); + + React.useEffect(() => { + let cancelled = false; + load(() => cancelled); + return () => { + cancelled = true; + }; + }, [load]); + + // Auto-refresh. CueGUI refreshes ~every 30 min; a web monitor wants fresher + // data, so this polls every 60s while enabled. Fires a desktop notification + // on completion when armed and stuck frames are actually present. + React.useEffect(() => { + if (!autoRefresh) return; + let cancelled = false; + // Skip a tick if the previous scan is still running, so a slow/degraded + // backend can't pile up overlapping, out-of-order refreshes. 
+ let inFlight = false; + const id = setInterval(async () => { + if (inFlight) return; + inFlight = true; + try { + const data = await load(() => cancelled); + if (cancelled || !data) return; + if (notify && typeof Notification !== "undefined" && Notification.permission === "granted") { + // Apply the same detection + hidden filters as the table so we only + // notify when a stuck frame would actually be shown. + const scanNow = Date.now() / 1000; + const stuckCount = data.filter( + (f) => + !hiddenFrames.has(f.id) && + !hiddenJobs.has(f.jobId) && + isStuck(f, pickFilter(f, filters), scanNow), + ).length; + if (stuckCount > 0) { + new Notification(`CueWeb: ${stuckCount} stuck frame(s) detected`); + } + } + } finally { + inFlight = false; + } + }, AUTO_REFRESH_MS); + return () => { + cancelled = true; + clearInterval(id); + }; + }, [autoRefresh, notify, load, filters, hiddenFrames, hiddenJobs]); + + function toggleNotify(checked: boolean) { + setNotify(checked); + if (checked && typeof Notification !== "undefined" && Notification.permission === "default") { + Notification.requestPermission(); + } + } + + // Services present in the data, for the add-filter service dropdown. + const availableServices = React.useMemo(() => { + const set = new Set(); + (raw ?? []).forEach((f) => f.service && set.add(f.service)); + return Array.from(set).sort(); + }, [raw]); + + // Apply detection + client-side removals, group by job. Identity is jobId, + // not jobName, so two jobs sharing a name aren't merged or acted on together. + const groups = React.useMemo(() => { + if (!raw) return null; + const stuck = raw.filter( + (f) => + !hiddenFrames.has(f.id) && + !hiddenJobs.has(f.jobId) && + isStuck(f, pickFilter(f, filters), now), + ); + const byJob = new Map(); + for (const f of stuck) { + const entry = byJob.get(f.jobId) ?? 
{ jobName: f.jobName, frames: [] }; + entry.frames.push(f); + byJob.set(f.jobId, entry); + } + return Array.from(byJob.entries()) + .sort((a, b) => a[1].jobName.localeCompare(b[1].jobName)) + .map(([jobId, { jobName, frames }]) => ({ + jobId, + jobName, + frames: frames.sort((a, b) => metricsOf(b, now).runtime - metricsOf(a, now).runtime), + })); + }, [raw, filters, now, hiddenFrames, hiddenJobs]); + + const totalStuck = groups?.reduce((n, g) => n + g.frames.length, 0) ?? 0; + + // Lazily fetch the last log line for visible stuck frames. + React.useEffect(() => { + if (!groups) return; + const pending = groups + .flatMap((g) => g.frames) + .filter((f) => f.jobLogDir && lastLines[f.id] === undefined) + .slice(0, 50); // bound per pass + if (pending.length === 0) return; + let cancelled = false; + (async () => { + const entries = await Promise.all( + pending.map(async (f) => { + const logPath = `${f.jobLogDir}/${f.jobName}.${f.name}.rqlog`; + const line = await getStuckFrameLastLine(logPath); + return [f.id, line] as const; + }), + ); + if (cancelled) return; + setLastLines((prev) => { + const next = { ...prev }; + for (const [id, line] of entries) next[id] = line; + return next; + }); + })(); + return () => { + cancelled = true; + }; + }, [groups, lastLines]); + + // Close the context menu on any outside interaction. 
+ React.useEffect(() => { + if (!menu) return; + const close = () => setMenu(null); + const onKey = (e: KeyboardEvent) => e.key === "Escape" && setMenu(null); + window.addEventListener("click", close); + window.addEventListener("scroll", close, true); + window.addEventListener("keydown", onKey); + return () => { + window.removeEventListener("click", close); + window.removeEventListener("scroll", close, true); + window.removeEventListener("keydown", onKey); + }; + }, [menu]); + + // --- helpers ------------------------------------------------------------- + function toFrame(sf: StuckFrame): Frame { + const { + jobId: _a, jobName: _b, jobLogDir: _c, jobHasComment: _d, + service: _e, avgFrameSec: _f, layerId: _g, layerMinCores: _h, + ...frame + } = sf; + return frame as Frame; + } + + function openLog(f: StuckFrame) { + const logDir = `${f.jobLogDir}/${f.jobName}.${f.name}.rqlog`; + const params = new URLSearchParams({ frameId: f.id, frameLogDir: logDir, username }); + window.open(`/frames/${encodeURIComponent(f.name)}?${params.toString()}`, "_blank", "noopener,noreferrer"); + } + + function exportLog(frames: StuckFrame[]) { + // Web adaptation of CueGUI's YAML "stuck_frames_db" file: a JSON download + // (the browser can't write to a fileshare). + // Key by jobId (not jobName) so distinct jobs sharing a name aren't merged + // into one bucket; carry jobName for readability. + const db: Record }> = {}; + for (const f of frames) { + const { runtime, llu, avg } = metricsOf(f, now); + const byJob = db[f.jobId] ?? (db[f.jobId] = { jobName: f.jobName, frames: {} }); + byJob.frames[`${f.number}-${Math.floor(now)}`] = { + layer: f.layerName, + host: f.lastResource, + llu, + runtime, + average: avg, + log: lastLines[f.id] ?? 
"", + }; + } + const blob = new Blob([JSON.stringify(db, null, 2)], { type: "application/json" }); + const url = URL.createObjectURL(blob); + const a = document.createElement("a"); + a.href = url; + a.download = "stuck_frames.json"; + a.click(); + URL.revokeObjectURL(url); + toastSuccess(`Logged ${frames.length} stuck frame(s)`); + } + + function hideFrame(f: StuckFrame) { + setHiddenFrames((prev) => new Set(prev).add(f.id)); + } + function hideJob(jobId: string) { + setHiddenJobs((prev) => new Set(prev).add(jobId)); + } + function addJobToExcludes(jobName: string) { + // Append the job name to the catch-all filter's exclude keywords. + persistFilters( + filters.map((flt, i) => + i === 0 + ? { ...flt, regex: flt.regex ? `${flt.regex}, ${jobName}` : jobName } + : flt, + ), + ); + toastSuccess(`Excluded ${jobName}`); + } + + async function act(f: StuckFrame, fn: () => Promise) { + setBusyId(f.id); + setMenu(null); + try { + // Only remove the frame from view when the backend action succeeded; + // performAction resolves false (without throwing) on failure. + const ok = await fn(); + if (ok) { + hideFrame(f); + await load(); + } + } finally { + setBusyId(null); + } + } + + const retry = (f: StuckFrame) => act(f, () => retryFrames([toFrame(f)])); + const eat = (f: StuckFrame) => act(f, () => eatFrames([toFrame(f)])); + const kill = (f: StuckFrame) => + act(f, () => killFrames([toFrame(f)], username, `Manual frame kill from CueWeb Stuck Frames by ${username}`)); + + function openCoreUpForFrame(f: StuckFrame) { + setMenu(null); + if (!f.layerId) return; + setCoreUp({ targets: [{ id: f.layerId, name: f.layerName }], cores: String(Math.max(1, f.layerMinCores || 1)) }); + } + function openCoreUpForJob(jobId: string) { + setMenu(null); + const frames = (raw ?? 
[]).filter((f) => f.jobId === jobId && f.layerId); + const seen = new Map(); + frames.forEach((f) => seen.set(f.layerId, f.layerName)); + if (seen.size === 0) return; + setCoreUp({ targets: Array.from(seen.entries()).map(([id, name]) => ({ id, name })), cores: "1" }); + } + async function applyCoreUp() { + if (!coreUp) return; + const cores = Number(coreUp.cores); + if (!Number.isFinite(cores) || cores < 0) return; + // Only close + refresh if every layer update succeeded (each helper toasts + // its own failure); otherwise keep the dialog open for retry. + const results = await Promise.all(coreUp.targets.map((t) => setLayerMinCores(t, cores))); + if (!results.every(Boolean)) return; + setCoreUp(null); + await load(); + } + + // --- render -------------------------------------------------------------- + const menuItemCls = "block w-full rounded px-2 py-1.5 text-left hover:bg-accent disabled:opacity-50"; + + return ( +
+
+

Stuck Frames

+
+ + + + +
+
+ +
+ +
+ + {groups === null ? ( +
+ + + +
+ ) : totalStuck === 0 ? ( + loadError ? ( +

{loadError}

+ ) : ( +

+ No stuck frames detected with the current filters. +

+ ) + ) : ( +
+ + + + + + + + + + + + + + + + {groups.map((g) => ( + + { + e.preventDefault(); + setMenu({ kind: "job", x: e.clientX, y: e.clientY, jobId: g.jobId, jobName: g.jobName }); + }} + > + + + + {g.frames.map((f) => { + const m = metricsOf(f, now); + return ( + { + e.preventDefault(); + setMenu({ kind: "frame", x: e.clientX, y: e.clientY, frame: f }); + }} + > + + + + + + + + + + ); + })} + + ))} + +
Name{/* comment icon col */}FrameHostLLURuntime% StuckAverageLast Line
{g.jobName} + {g.frames[0]?.jobHasComment ? ( + + ) : null} + +
{f.layerName} + {f.number}{hostOf(f.lastResource)}{fmtDur(m.llu)}{fmtDur(m.runtime)}{(m.percentStuck * 100).toFixed(2)}{fmtDur(m.avg)} + {lastLines[f.id] ?? ""} + {busyId === f.id ? " …" : ""} +
+
+ )} + + {!loading && groups !== null ? ( +

+ {totalStuck} stuck frame(s) across {groups.length} job(s). +

+ ) : null} + + {/* Context menu */} + {menu ? ( +
e.stopPropagation()} + > + {menu.kind === "frame" ? ( + <> + + + {menu.frame.retryCount >= 1 ? ( + + ) : null} +
+ + + +
+ + + + +
+ + + +
+ + + + ) : ( + <> + +
+ + + +
+ + + )} +
+ ) : null} + + {/* Core Up dialog */} + !o && setCoreUp(null)}> + + + Core Up + +
+

+ Set minimum cores for {coreUp?.targets.length === 1 ? `layer "${coreUp.targets[0].name}"` : `${coreUp?.targets.length ?? 0} layer(s)`}. +

+ setCoreUp((c) => (c ? { ...c, cores: e.target.value } : c))} + aria-label="Minimum cores" + /> +
+ + + + +
+
+
+ ); +} diff --git a/cueweb/app/utils/action_utils.ts b/cueweb/app/utils/action_utils.ts index c5bb3ca38..19ab3b0c3 100644 --- a/cueweb/app/utils/action_utils.ts +++ b/cueweb/app/utils/action_utils.ts @@ -100,14 +100,14 @@ export async function killLayers(layers: Layer[], username: string, reason: stri await performAction(endpoint, bodyAr, `Killed ${layers.length} layer(s)`); } -export async function killFrames(frames: Frame[], username: string, reason: string) { +export async function killFrames(frames: Frame[], username: string, reason: string): Promise { const endpoint = "/api/frame/action/kill"; const bodyAr = frames.map(frame => JSON.stringify({ frame, username, reason })); - await performAction(endpoint, bodyAr, `Killed ${frames.length} frame(s)`); + return performAction(endpoint, bodyAr, `Killed ${frames.length} frame(s)`); } @@ -132,12 +132,12 @@ export async function eatLayersFrames(layers: Layer[]) { await performAction(endpoint, bodyAr, `Ate ${layers.length} layer(s)`); } -export async function eatFrames(frames: Frame[]) { +export async function eatFrames(frames: Frame[]): Promise { const endpoint = "/api/frame/action/eat"; const bodyAr = frames.map(frame => JSON.stringify({ frame })); - await performAction(endpoint, bodyAr, `Ate ${frames.length} frame(s)`); + return performAction(endpoint, bodyAr, `Ate ${frames.length} frame(s)`); } @@ -183,12 +183,19 @@ export async function retryLayersDeadFrames(layers: Layer[]) { } } -export async function retryFrames(frames: Frame[]) { +export async function retryFrames(frames: Frame[]): Promise { const endpoint = "/api/frame/action/retry"; const bodyAr = frames.map(frame => JSON.stringify({ frame })); - await performAction(endpoint, bodyAr, `Retried ${frames.length} frame(s)`); + return performAction(endpoint, bodyAr, `Retried ${frames.length} frame(s)`); +} + +// Set a layer's minimum cores (CueGUI Stuck Frame "Core Up"). cores is a float +// core count. Returns success so callers can gate a refresh. 
+export async function setLayerMinCores(layer: { id: string; name?: string }, cores: number): Promise { + const endpoint = "/api/layer/action/setmincores"; + return performAction(endpoint, [JSON.stringify({ layer, cores })], `Set min cores to ${cores}`); } /**************************************/ diff --git a/cueweb/app/utils/get_utils.ts b/cueweb/app/utils/get_utils.ts index 99f570f75..271892da6 100644 --- a/cueweb/app/utils/get_utils.ts +++ b/cueweb/app/utils/get_utils.ts @@ -233,6 +233,47 @@ export async function getFrames(body: string): Promise { return response ? response : []; } +// A running frame plus the job/layer context the Stuck Frames page needs to +// apply CueGUI's per-service stuck-detection predicate (service + average +// frame time) and to act on the row (job, log dir, comment flag). +export type StuckFrame = Frame & { + jobId: string; + jobName: string; + jobLogDir: string; + jobHasComment: boolean; + service: string; + avgFrameSec: number; + layerId: string; + layerMinCores: number; +}; + +// Fetch every RUNNING frame across all unfinished jobs (server-aggregated via +// /api/stuck-frames), each stamped with its service and average frame time. +// The Stuck Frames page applies the detection thresholds locally so the +// filters stay instant. +export async function getStuckFrames(): Promise { + const ENDPOINT = "/api/stuck-frames"; + const response = await accessGetApi(ENDPOINT, JSON.stringify({})); + if (!Array.isArray(response)) { + throw new Error("Failed to load stuck frames from Cuebot."); + } + return response; +} + +// Best-effort fetch of a frame log's last line (the "Last Line" column). Empty +// when the log filesystem isn't reachable from the web server. +export async function getStuckFrameLastLine(logPath: string): Promise { + if (!logPath) return ""; + const base = process.env.NEXT_PUBLIC_URL ?? 
""; + try { + const resp = await fetch(`${base}/api/stuck-frames/lastline?path=${encodeURIComponent(logPath)}`); + const json = await resp.json(); + return typeof json?.lastLine === "string" ? json.lastLine : ""; + } catch { + return ""; + } +} + // Fetch a pending job based on the request body export async function getPendingJob(body: string): Promise { const ENDPOINT = "/api/job/getjob"; diff --git a/cueweb/components/ui/stuck-frame-filters.tsx b/cueweb/components/ui/stuck-frame-filters.tsx new file mode 100644 index 000000000..648b5ba1c --- /dev/null +++ b/cueweb/components/ui/stuck-frame-filters.tsx @@ -0,0 +1,203 @@ +"use client"; + +/* + * Copyright Contributors to the OpenCue Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import * as React from "react"; +import { Plus, X } from "lucide-react"; + +import { Button } from "@/components/ui/button"; +import { Checkbox } from "@/components/ui/checkbox"; +import { Input } from "@/components/ui/input"; + +// One detection filter (CueGUI StuckFrameBar). service === "" is the catch-all +// ("All" when it's the only filter, "All Other Types" when service filters +// exist). The four thresholds mirror CueGUI's spinboxes. 
+export type StuckFilter = { + service: string; + regex: string; // exclude keywords, comma-separated + percentStuck: number; // % of runtime since last log update + minLlu: number; // minutes + avgComp: number; // % of average completion time + runtime: number; // minutes + enabled: boolean; +}; + +// CueGUI defaults: [percentStuck, minLlu, avgComp, runtime]. +export const SERVICE_DEFAULTS: Record = { + preprocess: [1, 1, 115, 10], + nuke: [50, 5, 115, 10], + arnold: [50, 60, 115, 120], +}; + +export const DEFAULT_FILTER: StuckFilter = { + service: "", + regex: "", + percentStuck: 50, + minLlu: 30, + avgComp: 115, + runtime: 60, + enabled: true, +}; + +export function makeServiceFilter(service: string): StuckFilter { + const d = SERVICE_DEFAULTS[service]; + return d + ? { service, regex: "", percentStuck: d[0], minLlu: d[1], avgComp: d[2], runtime: d[3], enabled: true } + : { ...DEFAULT_FILTER, service }; +} + +const NUM = "h-8 w-20 text-right"; + +function NumberField({ + label, + suffix, + value, + disabled, + onChange, +}: { + label: string; + suffix: string; + value: number; + disabled: boolean; + onChange: (n: number) => void; +}) { + return ( + + ); +} + +export function StuckFrameFilters({ + filters, + onChange, + availableServices, +}: { + filters: StuckFilter[]; + onChange: (filters: StuckFilter[]) => void; + availableServices: string[]; +}) { + function update(index: number, patch: Partial) { + onChange(filters.map((f, i) => (i === index ? { ...f, ...patch } : f))); + } + function addFilter() { + // Default the new filter to the first available service not already used. + // Bail out if every service is taken, rather than adding an empty row. 
+ const used = new Set(filters.map((f) => f.service).filter(Boolean)); + const next = availableServices.find((s) => !used.has(s)); + if (!next) return; + onChange([...filters, makeServiceFilter(next)]); + } + function removeFilter(index: number) { + onChange(filters.filter((_, i) => i !== index)); + } + + const hasServiceFilters = filters.some((f, i) => i > 0); + const usedServices = new Set(filters.map((f) => f.service).filter(Boolean)); + const canAddFilter = availableServices.some((s) => !usedServices.has(s)); + + return ( +
+ {filters.map((f, i) => { + const isCatchAll = i === 0; + const disabled = !f.enabled; + return ( +
+
+ Layer Service + {isCatchAll ? ( + + {hasServiceFilters ? "All Other Types" : "All"} + + ) : ( + + )} +
+ + update(i, { percentStuck: n })} /> + update(i, { minLlu: n })} /> + update(i, { avgComp: n })} /> + update(i, { runtime: n })} /> + + + + + + {isCatchAll ? ( + + ) : ( + + )} +
+ ); + })} +
+ ); +} diff --git a/docs/_docs/concepts/glossary.md b/docs/_docs/concepts/glossary.md index 07e48c3b6..9301ee80e 100644 --- a/docs/_docs/concepts/glossary.md +++ b/docs/_docs/concepts/glossary.md @@ -114,6 +114,13 @@ A machine that is running an instance of *rqd*. This machine will split up into A job is a collection of *layers*, which is sent as a *script* to the queue to be processed on remote *cores*. +## Last Log Update (LLU) + +The time elapsed since a running *frame* last wrote to its log. A large LLU on a +frame that is still running is the main signal that the frame may be *stuck* - +the process is alive but no longer making progress. CueWeb's Stuck Frames page +shows LLU per frame and uses it (relative to runtime) to flag stuck frames. + ## Layers The sub-jobs in an *outline script* job. Each layer contains a frame range and a @@ -203,6 +210,15 @@ occasional reruns or for redirecting legacy content to training allocations. When all the *frames* of the first *job* need to finish before the second job can begin. +## Stuck frame + +A running *frame* that appears hung: it keeps running but has stopped writing to +its log, so its *Last Log Update (LLU)* keeps climbing relative to its runtime. +A stuck frame is not a distinct frame state - it is detected heuristically (LLU +vs. runtime vs. the *layer*'s average frame time). CueGUI's CueCommander Stuck +Frame plugin and CueWeb's Stuck Frames page list them so you can retry, eat, +kill, or *core up* (raise the minimum cores of) the affected layer. + ## Subscription A Subscription is an object that associates multiple *allocations* with a diff --git a/docs/_docs/developer-guide/cueweb-development.md b/docs/_docs/developer-guide/cueweb-development.md index b991fdd1a..0df11a699 100644 --- a/docs/_docs/developer-guide/cueweb-development.md +++ b/docs/_docs/developer-guide/cueweb-development.md @@ -1168,6 +1168,73 @@ rewrites Cuebot's duplicate-key error into a short user-facing message. 
--- +## Stuck Frames page (CueCommander parity) + +The `/stuck-frames` page (`app/stuck-frames/page.tsx`) replicates the CueGUI +CueCommander Stuck Frame plugin. Unlike the other tables it renders its own +job-grouped layout (not `SimpleDataTable`), because rows are grouped under a job +header and the detection runs client-side. Files involved: + +```text +app/api/stuck-frames/route.ts # aggregate every RUNNING frame across unfinished jobs +app/api/stuck-frames/lastline/route.ts # tail a frame's .rqlog for the "Last Line" column +app/stuck-frames/page.tsx # page + detection helpers (metricsOf/pickFilter/isExcluded/isStuck) +components/ui/stuck-frame-filters.tsx # StuckFilter type, DEFAULT_FILTER, SERVICE_DEFAULTS, makeServiceFilter, StuckFrameFilters +app/utils/get_utils.ts # StuckFrame type, getStuckFrames(), getStuckFrameLastLine() +app/utils/action_utils.ts # retryFrames/eatFrames/killFrames, setLayerMinCores (Core Up) +``` + +### Data source + +`getStuckFrames()` → `/api/stuck-frames` aggregates every RUNNING frame +across unfinished jobs server-side, stamping each with its `service`, +`avgFrameSec`, `layerId`, and `layerMinCores` (the `StuckFrame` type extends +`Frame`). The page polls it on a timer (Auto-refresh). Per visible frame, +`getStuckFrameLastLine()` → `/api/stuck-frames/lastline` fills the **Last +Line** column; that route canonicalizes the path with `realpath`, enforces the +optional `CUEWEB_LOG_ROOTS` allow-list, and `tail`s the `.rqlog` via `execFile` +(no shell) - best-effort, returning an empty line when the log FS isn't mounted. + +### Detection (client-side) + +The detection lives in `page.tsx` so the filters stay instant. `metricsOf(f)` +derives `runtime = now - startTime`, `llu = now - lluTime` (RUNNING only) and +`percentStuck = llu / runtime`. `pickFilter` selects the most specific filter +for a frame (a service row whose `service` matches, else the catch-all at index +0). 
`isExcluded` runs the filter's comma-separated `regex` keywords against the +job/layer name. `isStuck` mirrors CueGUI: `llu > minLlu*60` **and** +`percentStuck*100 > filter.percentStuck` threshold **and** `runtime > avg*avgComp/100` +**and** `percentStuck < 1.1` **and** `runtime > 500`. The `percentStuck < 1.1` +term is a CueGUI-parity sanity bound, not a maximum-stuck filter: `llu` normally +cannot exceed `runtime`, so the ratio stays in `[0, 1]`, but a stale log +timestamp, a reused log path on retry, or clock skew between the log filesystem +and the server can push it slightly above `1.0` - the bound discards those +implausible readings rather than flagging them as stuck. + +### Filters + +`stuck-frame-filters.tsx` owns the `StuckFilter` shape and the +`StuckFrameFilters` bar. Filter 0 is the catch-all (`service: ""`); the **+** +button appends a `makeServiceFilter(service)` row for the first +not-yet-used service from the page-supplied `availableServices`, seeded from +`SERVICE_DEFAULTS` (`preprocess`/`nuke`/`arnold`) or `DEFAULT_FILTER` otherwise. +The full filter list persists to `localStorage["cueweb.stuck-frames.filters"]` +(`FILTERS_KEY`). + +### Actions + +Frame/job context menus are rendered inline in `page.tsx` (not the shared +`action-context-menu.tsx`). Retry/Eat/Kill call `retryFrames` / `eatFrames` / +`killFrames` (`/api/frame/action/{retry,eat,kill}`). **Core Up** opens a small +dialog and calls `setLayerMinCores()` → `/api/layer/action/setmincores` +(one call per target layer; the job variant fans out across the job's stuck +layers). **Log Stuck Frame** / **Log and Retry/Eat/Kill** run a client-side +`exportLog(...)` before the action. **Frame/Job Not Stuck** and the **Exclude** +entries are client-only: the former hide ids in component state (cleared by +**Clear**), the latter append the job name to the catch-all filter's (filter 0) exclude keywords.
+ +--- + ## Facility Service Defaults (CueCommander parity) The `/services` page replicates the CueGUI CueCommander Facility Service Defaults diff --git a/docs/_docs/getting-started/deploying-cueweb.md b/docs/_docs/getting-started/deploying-cueweb.md index 2b5ebcdb0..c1d86bbe2 100644 --- a/docs/_docs/getting-started/deploying-cueweb.md +++ b/docs/_docs/getting-started/deploying-cueweb.md @@ -511,6 +511,38 @@ The form auto-saves a draft to `localStorage` on every keystroke and keeps per-f --- +## Stuck Frames page (log access) + +The `/stuck-frames` route (CueCommander → Stuck Frame) finds running frames that have stopped writing to their logs. It ships with CueWeb and needs no extra services - it reads running frames through the same REST gateway + cuebot path as the rest of the app. The one deployment-specific concern is **frame-log access**, which powers the page's **Last Line** column and the Tail/View Log actions. + +**Mount the render log directory into the CueWeb container, read-only.** CueWeb's server reads frame logs from its own filesystem, so the directory where RQD writes logs (the sandbox uses `/tmp/rqd/logs`, matching cuebot's `CUE_FRAME_LOG_DIR`) must be visible to the CueWeb container at the same path: + +```yaml +# docker-compose.yml (cueweb service) +volumes: + - /tmp/rqd/logs:/tmp/rqd/logs:ro +``` + +```yaml +# Kubernetes: mount the shared logs volume into the cueweb pod, e.g. +volumeMounts: + - name: frame-logs + mountPath: /net/render/logs + readOnly: true +``` + +If the log directory is not mounted, the page still lists stuck frames, but the **Last Line** column stays empty and the in-app log actions can't read the file. + +**Optionally restrict which roots are readable** with `CUEWEB_LOG_ROOTS` - a colon-separated list of absolute path prefixes. When set, the log-reading routes (`/api/stuck-frames/lastline` and the log download) only serve files under one of those roots; when unset, reads are not restricted to a root. 
Scope it to the mounted log dir: + +```bash +CUEWEB_LOG_ROOTS=/net/render/logs +``` + +**Using the page**: open **CueCommander → Stuck Frame**, tune the filter bar (Min LLU, % of Run Since LLU, Total Runtime) or add a per-service filter with **+**, then right-click a frame for Retry / Eat / Kill / View Log / **Core Up**. See the [CueWeb User Guide](/docs/user-guides/cueweb-user-guide/#stuck-frames) for the full walkthrough. + +--- + ## Reverse Proxy Configuration ### Nginx Configuration diff --git a/docs/_docs/other-guides/cueweb.md b/docs/_docs/other-guides/cueweb.md index 64c160fb1..c75d30112 100644 --- a/docs/_docs/other-guides/cueweb.md +++ b/docs/_docs/other-guides/cueweb.md @@ -173,22 +173,29 @@ CueWeb replicates the core functionality of [CueGUI](https://www.opencue.io/docs - **Create Show** dialog: enter a unique alphanumeric name and optionally subscribe the new show to one or more allocations (checkbox + Size + Burst per allocation). - **Show actions** via the row's right-click menu: **Show Properties** (a four-tab dialog - Settings with default max/min cores and comment email, Booking with enable booking / enable dispatch, read-only Statistics, and Raw Show Data) and **Create Subscription...** (subscribe a show to an allocation with Size and Burst). -31. **Facility Service Defaults (CueCommander → Services):** +31. **Stuck Frames (CueCommander → Stuck Frame):** + - A stuck-frame finder at `/stuck-frames`, the CueWeb equivalent of CueGUI's CueCommander Stuck Frame window. Reached from the CueCommander menu / sidebar entry. + - Scans every running frame across active jobs and flags the ones that look hung (the log has gone silent relative to runtime), grouped under their job. Columns: Name, Frame, Host, LLU, Runtime, % Stuck, Average, Last Line. Auto-refreshes on a timer, with **Refresh** / **Clear** controls. + - **Detection filters** (saved per browser): % of Run Since LLU, Min LLU, % Avg Completion, Total Runtime, and Exclude Keywords. 
The **+** button adds a per-service filter row (catch-all "All Other Types" plus one row per render service, so e.g. Arnold can use looser thresholds than quicker services). + - **Frame actions** via the row's right-click menu: Tail/View/View Last Log, Retry / Eat / Kill, Log Stuck Frame (and Log and Retry / Eat / Kill), Frame Not Stuck, Add Job to Excludes / Exclude and Remove Job, **Core Up** (raise the layer's minimum cores), and View Host. + - **Job actions** via the job header's right-click menu: View Comments, Job Not Stuck, Add Job to Excludes / Exclude and Remove Job, and **Core Up** across the job's stuck layers. + +32. **Facility Service Defaults (CueCommander → Services):** - A facility-wide service-defaults editor at `/services`, the CueWeb equivalent of CueGUI's Facility Service Defaults tab. Reached from the CueCommander menu / sidebar entry. It edits the default resource requirements applied to a layer when it runs a given service (for example `arnold`, `maya`, `nuke`, or `shell`). - Two panes: a left list of services (with **New** / **Del**) and a right edit form with Name, Threadable, Min/Max Threads (100 = 1 thread), Min Memory MB, Min Gpu Memory MB, Timeout, Timeout LLU, OOM Increase MB, and Tags (predefined checkboxes or a Custom Tags free-text toggle). - Because these are facility-wide defaults, **Save** asks for a confirmation before creating or updating, and **Del** confirms before removing a service; a toast reports the result. -32. **Subscriptions (CueCommander → Subscriptions):** +33. **Subscriptions (CueCommander → Subscriptions):** - A per-show subscriptions table at `/subscriptions`, the CueWeb equivalent of CueGUI's CueCommander Subscriptions window. Pick a show from the dropdown to list its subscriptions, one row per allocation, with columns Alloc, Usage, Size, Burst, and Used. A subscription is a show's reservation against an allocation: **Size** is the guaranteed cores, **Burst** the maximum it may temporarily use. 
- **Add Subscription** subscribes the show to another allocation (Size + Burst); **Show Properties** opens the same four-tab dialog as the Shows page. - **Row actions** via the right-click menu: **Edit Subscription Size...** (with a billing confirmation), **Edit Subscription Burst...**, and **Delete Subscription**. -33. **Subscription Graphs (CueCommander → Subscription Graphs):** +34. **Subscription Graphs (CueCommander → Subscription Graphs):** - A visual view at `/subscription-graphs`, the CueWeb equivalent of CueGUI's CueCommander Subscription Graphs window. A **Shows** multi-select (All Shows / Clear / per-show) chooses which shows to graph; each gets one horizontal bar per subscription. - Each bar is scaled to the allocation's total cores and color-coded like CueGUI (legend at the top): sky-blue allocation capacity, yellow-green in-use cores, a blue size marker and a red burst marker. Hovering shows the exact values. - **Row actions** via the right-click menu match the Subscriptions table plus **Add new subscription**; right-clicking a show with no subscriptions offers **Add new subscription** to create the first one. -34. **Limits (CueCommander → Limits):** +35. **Limits (CueCommander → Limits):** - A limits table at `/limits`, the CueWeb equivalent of CueGUI's CueCommander Limits window. Reached from the CueCommander menu / sidebar entry. - Columns: Limit Name, Max Value, Current Running. Auto-refreshes every 30 seconds, with a **Refresh** button for an immediate reload. - **Add Limit** dialog creates a new limit (max value starts at 0). 
diff --git a/docs/_docs/quick-starts/quick-start-cueweb.md b/docs/_docs/quick-starts/quick-start-cueweb.md index 662e4c8ac..8d681f60c 100644 --- a/docs/_docs/quick-starts/quick-start-cueweb.md +++ b/docs/_docs/quick-starts/quick-start-cueweb.md @@ -258,6 +258,15 @@ The job right-click menu, and the tabbed Job Details page it can open: ![CueWeb job search](/assets/images/cueweb/cueweb_cuetopia_monitor_jobs_search_jobs.png) +### Find stuck frames + +Open **CueCommander → Stuck Frame** to find running frames that look hung - frames that keep running but have stopped writing to their log. The page scans every running frame and lists the ones that cross the detection thresholds (Last Log Update vs. runtime), grouped by job. + +![CueWeb Stuck Frames page](/assets/images/cueweb/cueweb_cuecommander_stuck_frame.png) + +- Tune the filter bar (**Min LLU**, **% of Run Since LLU**, **Total Runtime**) to control how aggressively frames are flagged; the **+** button adds a per-service filter row so long-running services (e.g. Arnold) can use looser limits than quicker ones. +- Right-click a frame for **Retry / Eat / Kill**, **View Log**, or **Core Up** (raise the layer's minimum cores - a common fix when a frame is starved for resources). Right-click a job header for job-wide actions. + --- diff --git a/docs/_docs/reference/cueweb.md b/docs/_docs/reference/cueweb.md index ebd9fa49f..2d2f6db1b 100644 --- a/docs/_docs/reference/cueweb.md +++ b/docs/_docs/reference/cueweb.md @@ -418,6 +418,21 @@ Clicking a show name opens `/shows/[showName]` (`cueweb/app/shows/[showName]/pag | **Reparent** | Dragging a group onto another calls `reparentGroups()` → `/api/group/action/reparentgroups` → `job.GroupInterface/ReparentGroups`; dragging a job onto a group calls `reparentJobs()` → `/api/group/action/reparentjobs` → `job.GroupInterface/ReparentJobs`. 
Drop targets are validated client-side (no self/descendant cycles, no same-parent no-ops), and reparents are serialized one at a time and rolled back on a failed RPC. | | **Refresh** | The header **Refresh** button remounts the tree to reload groups and jobs. | +### Stuck Frames + +A stuck-frame finder at `/stuck-frames` (`cueweb/app/stuck-frames/page.tsx`), the CueWeb equivalent of CueGUI's CueCommander Stuck Frame window (`StuckFramePlugin`). Reached from **CueCommander → Stuck Frame** (header dropdown and sidebar). + +![CueWeb Stuck Frames page](/assets/images/cueweb/cueweb_cuecommander_stuck_frame.png) + +| Behavior | Description | +|----------|-------------| +| **Data source** | `getStuckFrames()` (`app/utils/get_utils.ts`) → `/api/stuck-frames` aggregates every RUNNING frame across unfinished jobs server-side, each stamped with its `service`, `avgFrameSec`, `layerId`, and `layerMinCores`. The **Last Line** column is fetched per frame via `getStuckFrameLastLine()` → `/api/stuck-frames/lastline` (best-effort; empty when the log filesystem isn't mounted). | +| **Detection** | Applied client-side so the filters stay instant (CueGUI parity). A frame is stuck when `lluTime` age `> minLlu*60` **and** `percentStuck*100 > filter.percentStuck` (the matched filter's threshold) **and** `runtime > avg*avgComp/100` **and** `percentStuck < 1.1` **and** `runtime > 500`, where `percentStuck = lluAge / runtime`. | +| **Filters** | Filter row 0 is the catch-all; rows added via **+** target one `service` each and override the catch-all for matching frames (`pickFilter`). `SERVICE_DEFAULTS` seeds `preprocess` / `nuke` / `arnold` thresholds; `makeServiceFilter` falls back to `DEFAULT_FILTER` otherwise. Filters persist to `localStorage["cueweb.stuck-frames.filters"]`. Exclude Keywords are a comma-separated regex list matched against job/layer name. | +| **Columns** | Name (grouped under a job header), comment marker, Frame, Host, LLU, Runtime, % Stuck, Average, Last Line.
| +| **Frame actions** | Right-click menu: Tail/View/View Last Log (open the log viewer); Retry / Eat / Kill via `retryFrames` / `eatFrames` / `killFrames` (`/api/frame/action/{retry,eat,kill}`); Log Stuck Frame and Log and Retry/Eat/Kill (client-side log export then the action); Frame Not Stuck and Job/Frame exclude (client-side hide + Exclude Keywords); **Core Up** via `setLayerMinCores()` → `/api/layer/action/setmincores`; View Host. | +| **Job actions** | Right-click a job header: View Comments, Job Not Stuck, Add Job to Excludes / Exclude and Remove Job (client-side), and **Core Up** applied across the job's stuck layers (one `setLayerMinCores` per layer). | + ### Facility Service Defaults A facility-wide service-defaults editor at `/services` (`cueweb/app/services/page.tsx` + `components/ui/service-defaults-form.tsx`), the CueWeb equivalent of CueGUI's Facility Service Defaults tab (`ServiceDialog` / `ServiceForm`). Reached from **CueCommander → Services** (header dropdown and sidebar). @@ -1245,7 +1260,9 @@ Layout, left to right: - Shows (`/shows`) - implemented; shows stats table with Create Show, Show Properties, and Create Subscription, plus a per-show group-tree detail page at `/shows/[showName]` (see [Shows](#shows)). - - Stuck Frame (`/stuck-frames`) + - Stuck Frame (`/stuck-frames`) - implemented; stuck-frame finder with + per-service detection filters and frame/job actions including Core Up + (see [Stuck Frames](#stuck-frames)). - Subscription Graphs (`/subscription-graphs`) - Subscriptions (`/subscriptions`) diff --git a/docs/_docs/tutorials/cueweb-tutorial.md b/docs/_docs/tutorials/cueweb-tutorial.md index cf3d586c0..0ff82c871 100644 --- a/docs/_docs/tutorials/cueweb-tutorial.md +++ b/docs/_docs/tutorials/cueweb-tutorial.md @@ -390,6 +390,22 @@ The frame right-click menu, and the confirmation toast shown after an action: 6. Right-click the frame and select "Retry" 7. 
Watch the frame change from red to gray (pending) +### Finding and clearing stuck frames + +Failed frames turn red, but a *stuck* frame is trickier: it keeps running (gray-green) while no longer making progress - the process is alive but has stopped writing to its log. CueWeb's **Stuck Frames** page finds these for you. + +1. Open **CueCommander → Stuck Frame** from the header or sidebar. + + ![CueWeb Stuck Frames page](/assets/images/cueweb/cueweb_cuecommander_stuck_frame.png) + +2. The page scans every running frame and lists the ones whose log has gone silent relative to their runtime, grouped under their job. Read the **LLU** (time since the last log line), **Runtime**, and **% Stuck** columns to judge each frame - a high **% Stuck** means the log has been quiet for most of the run. +3. If nothing shows up, loosen the filters at the top - lower **Min LLU** or **% of Run Since LLU**. To tune detection per render type, click **+** to add a service-specific filter row (so e.g. Arnold frames, which legitimately run long, use looser limits than quick ones). +4. Right-click a frame you believe is hung and pick an action: + - **View Log** / **View Last Log** to confirm it has really stalled. + - **Retry** to requeue it, **Eat** to mark it done, or **Kill** to stop it. + - **Core Up** to raise the layer's minimum cores when a frame is starved for resources. +5. Use **Frame Not Stuck** (or **Job Not Stuck**) to dismiss a false positive, or **Add Job to Excludes** to stop a known-noisy job from appearing. 
+ --- ## Advanced Search and Filtering diff --git a/docs/_docs/user-guides/cueweb-user-guide.md b/docs/_docs/user-guides/cueweb-user-guide.md index b831d25a9..b8eb21038 100644 --- a/docs/_docs/user-guides/cueweb-user-guide.md +++ b/docs/_docs/user-guides/cueweb-user-guide.md @@ -1162,6 +1162,81 @@ Clicking a show name (or navigating to `/shows/`) opens the show's **group --- +## Stuck Frames + +The **Stuck Frames** page (CueCommander → Stuck Frame in the sidebar or header) helps you find running frames that appear to be hung - frames that keep running but have stopped writing to their log. It is the CueWeb equivalent of CueGUI's CueCommander Stuck Frame window. + +Open it from the **CueCommander** menu (or the matching entry in the left sidebar). + +![Stuck Frame entry in the CueCommander menu](/assets/images/cueweb/cueweb_cuecommander_stuck_frame_menu.png) + +The page scans every running frame across all active jobs and lists the ones that match the current detection filters, grouped under their job. + +![CueWeb Stuck Frames page](/assets/images/cueweb/cueweb_cuecommander_stuck_frame.png) + +### Stuck Frame columns + +| Column | Description | +|--------|-------------| +| Name | Layer name (rows are grouped under a job header) | +| Frame | Frame number | +| Host | Host the frame is running on | +| LLU | Time since the **L**ast **L**og **U**pdate - how long the log has been silent | +| Runtime | How long the frame has been running | +| % Stuck | LLU as a percentage of runtime - the closer to 100%, the more likely it is hung | +| Average | The layer's average frame time, for comparison | +| Last Line | The last line written to the frame's log | + +The table auto-refreshes on a timer (toggle **Auto-refresh** off to freeze it), and **Refresh** reloads immediately. **Clear** resets any rows or jobs you have manually hidden. + +### Detection filters + +The filter bar at the top controls which frames are flagged. 
A frame is considered stuck only when its log has been silent longer than **Min LLU**, its **% of Run Since LLU** exceeds the threshold, and it has been running long enough relative to its layer average. Your filter settings are saved per browser. + +- **% of Run Since LLU** - minimum percentage of the runtime spent with no log activity. +- **Min LLU** - minimum time (minutes) the log must have been silent. +- **% Avg Completion** - how far past the layer's average frame time the frame must be. +- **Total Runtime** - minimum runtime threshold. +- **Exclude Keywords** - comma-separated terms; frames whose job or layer name matches are skipped. +- **Enable** - turn a filter row on or off. + +Click the **+** button to add a **service-specific** filter. The first row is the catch-all (labelled **All** on its own, or **All Other Types** once service rows exist); each added row targets one render **service** from a dropdown and applies its own thresholds, so long-running services (e.g. Arnold) can use looser limits than quick ones. A frame is matched to the most specific row for its service. Use the **×** button to remove a service row. + +![Adding a service-specific Stuck Frame filter](/assets/images/cueweb/cueweb_cuecommander_stuck_frame_add_service_filter.png) + +### Frame actions + +Right-click a frame row to open its actions menu. + +![Stuck frame row context menu](/assets/images/cueweb/cueweb_cuecommander_stuck_frame_layer_menu_options.png) + +- **Tail Log / View Log / View Last Log** - open the frame's log. +- **Retry / Eat / Kill** - the standard frame operations. +- **Log Stuck Frame** - export the frame's details for a report; **Log and Retry / Log and Eat / Log and Kill** combine the export with an action. +- **Frame Not Stuck** - hide this frame from the list (it is not really stuck). +- **Add Job to Excludes** / **Exclude and Remove Job** - add the job's name to the exclude keywords (and optionally drop it from the list now). 
+- **Core Up** - raise the minimum cores on the frame's layer (see the **Core Up** section below). +- **View Host** - open the host's detail page. + +### Job actions + +Right-click a job header row for job-level actions. + +![Stuck frame job context menu](/assets/images/cueweb/cueweb_cuecommander_stuck_frame_job_menu_options.png) + +- **View Comments** - open the job's comments page. +- **Job Not Stuck** - hide the whole job from the list. +- **Add Job to Excludes** / **Exclude and Remove Job** - exclude the job by name. +- **Core Up** - raise the minimum cores across the job's stuck layers. + +### Core Up + +**Core Up** opens a dialog to increase the minimum cores reserved for the affected layer(s) - a common remedy when a frame is stuck because it is starved for cores. Enter the new core count and click **Apply**. + +![Core Up dialog](/assets/images/cueweb/cueweb_cuecommander_stuck_frame_core_up_popup.png) + +--- + ## Facility Service Defaults The **Facility Service Defaults** page (CueCommander → Services in the sidebar or header) edits the facility-wide service templates - the default resource requirements that apply to a layer when it runs a given service (for example `arnold`, `maya`, `nuke`, or `shell`). It is the CueWeb equivalent of CueGUI's Facility Service Defaults tab.
diff --git a/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame.png b/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame.png new file mode 100644 index 000000000..6146d7a76 Binary files /dev/null and b/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame.png differ diff --git a/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_add_service_filter.png b/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_add_service_filter.png new file mode 100644 index 000000000..c576eb360 Binary files /dev/null and b/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_add_service_filter.png differ diff --git a/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_add_service_filter_dark.png b/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_add_service_filter_dark.png new file mode 100644 index 000000000..e6b73ce95 Binary files /dev/null and b/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_add_service_filter_dark.png differ diff --git a/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_core_up_popup.png b/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_core_up_popup.png new file mode 100644 index 000000000..30cbd351b Binary files /dev/null and b/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_core_up_popup.png differ diff --git a/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_core_up_popup_dark.png b/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_core_up_popup_dark.png new file mode 100644 index 000000000..846d68aa0 Binary files /dev/null and b/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_core_up_popup_dark.png differ diff --git a/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_dark.png b/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_dark.png new file mode 100644 index 000000000..a41d6a2f5 Binary files /dev/null and b/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_dark.png differ diff --git 
a/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_job_menu_options.png b/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_job_menu_options.png new file mode 100644 index 000000000..7b498263b Binary files /dev/null and b/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_job_menu_options.png differ diff --git a/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_job_menu_options_dark.png b/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_job_menu_options_dark.png new file mode 100644 index 000000000..097fcba79 Binary files /dev/null and b/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_job_menu_options_dark.png differ diff --git a/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_layer_menu_options.png b/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_layer_menu_options.png new file mode 100644 index 000000000..26ac0a202 Binary files /dev/null and b/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_layer_menu_options.png differ diff --git a/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_layer_menu_options_dark.png b/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_layer_menu_options_dark.png new file mode 100644 index 000000000..e0d25dea1 Binary files /dev/null and b/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_layer_menu_options_dark.png differ diff --git a/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_menu.png b/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_menu.png new file mode 100644 index 000000000..123d143c5 Binary files /dev/null and b/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_menu.png differ diff --git a/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_menu_dark.png b/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_menu_dark.png new file mode 100644 index 000000000..3e97273f5 Binary files /dev/null and b/docs/assets/images/cueweb/cueweb_cuecommander_stuck_frame_menu_dark.png differ