From b69e27ac9edf3714615ec267659833147c3625a9 Mon Sep 17 00:00:00 2001 From: Shai Almog <67850168+shai-almog@users.noreply.github.com> Date: Fri, 12 Jun 2026 06:09:15 +0300 Subject: [PATCH 1/3] ci: capture faulting stacks when a screenshot suite hangs The mac-native / iOS screenshot jobs flake with 'N of 128 screenshots not produced': the suite stops emitting mid-run and the runner times out waiting for CN1SS:SUITE:FINISHED. Artifact forensics show the app is not idle when this happens -- ParparVM's SignalHandler converts SIGSEGV into a Java NPE and returns, so a thread that faulted outside a Java try frame re-executes the faulting instruction forever ('We had a signal 11' spam in device-runner.log, observed from the UIKit main thread). On suite timeout both runners now: - 'sample' the live app process into app-hang-sample.txt. Because the crashed thread keeps re-faulting at the same PC, the sample contains the exact faulting stack; for genuine deadlocks it captures every thread's wait state. - log when the signal-handler loop signature is present in the app log. - collect crash reports written to ~/Library/Logs/DiagnosticReports during the run (covers the process-died-outright mode). Diagnostics only; no behavior change on the success path. 
Co-Authored-By: Claude Fable 5 --- scripts/run-ios-ui-tests.sh | 45 ++++++++++++++++++++++++++++++ scripts/run-mac-native-ui-tests.sh | 40 ++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) diff --git a/scripts/run-ios-ui-tests.sh b/scripts/run-ios-ui-tests.sh index a787b3b524..425e88d9a3 100755 --- a/scripts/run-ios-ui-tests.sh +++ b/scripts/run-ios-ui-tests.sh @@ -730,6 +730,39 @@ APP_PROCESS_NAME="${WRAPPER_NAME%.app}" echo "App Install : $(( (INSTALL_END - INSTALL_START) * 1000 )) ms" >> "$ARTIFACTS_DIR/ios-test-stats.txt" echo "App Launch : $(( (LAUNCH_END - LAUNCH_START) * 1000 )) ms" >> "$ARTIFACTS_DIR/ios-test-stats.txt" +# Timestamp marker so crash reports written during this run can be picked +# out of ~/Library/Logs/DiagnosticReports afterwards (find -newer). The +# simulator app is a host process, so its crash reports land on the host. +LAUNCH_MARKER="$ARTIFACTS_DIR/.launch-marker" +touch "$LAUNCH_MARKER" +APP_EXECUTABLE_NAME="$(/usr/libexec/PlistBuddy -c 'Print CFBundleExecutable' "$APP_BUNDLE_PATH/Info.plist" 2>/dev/null || true)" + +# When the suite times out the app is usually not idle: ParparVM's +# SignalHandler (CodenameOne_GLAppDelegate.m) converts SIGSEGV into a Java +# NPE and returns, so a thread that faulted outside a Java try frame +# re-executes the faulting instruction forever ("We had a signal 11" spam +# in the device log). The simulator app is a plain host process, so a +# `sample` taken at timeout contains the exact faulting stack (and, for +# genuine deadlocks, every thread's wait state). 
+capture_hang_diagnostics() { + local pid spam + if [ -n "$APP_EXECUTABLE_NAME" ]; then + pid="$(pgrep -x "$APP_EXECUTABLE_NAME" 2>/dev/null | head -n 1 || true)" + else + pid="" + fi + if [ -n "$pid" ]; then + ri_log "Sampling hung app (pid=$pid) -> app-hang-sample.txt" + sample "$pid" 5 -file "$ARTIFACTS_DIR/app-hang-sample.txt" >/dev/null 2>&1 || true + else + ri_log "No live ${APP_EXECUTABLE_NAME:-app} process found to sample" + fi + spam="$(grep -c 'We had a signal' "$TEST_LOG" 2>/dev/null || true)" + if [ "${spam:-0}" -gt 0 ]; then + ri_log "Signal-handler loop detected: ${spam} 'We had a signal' lines in device log (a crashed thread is spinning in ParparVM's SignalHandler; see app-hang-sample.txt for the faulting stack)" + fi +} + END_MARKER="CN1SS:SUITE:FINISHED" # Per-suite budget (seconds). The 300 -> 600 bump from earlier landed # back when the suite was ~37 tests; it has since grown to ~90, and the @@ -750,6 +783,7 @@ while true; do NOW="$(date +%s)" if [ $(( NOW - START_TIME )) -ge $TIMEOUT_SECONDS ]; then ri_log "STAGE:TIMEOUT -> DeviceRunner did not emit completion marker within ${TIMEOUT_SECONDS}s" + capture_hang_diagnostics break fi sleep 5 @@ -769,6 +803,17 @@ xcrun simctl spawn "$SIM_DEVICE_ID" \ --predicate '(composedMessage CONTAINS "CN1SS") OR (eventMessage CONTAINS "CN1SS")' \ > "$FALLBACK_LOG" 2>/dev/null || true +# Collect any crash reports the OS wrote for the app during this run +# (simulator app crashes report to the host's DiagnosticReports). 
+CRASH_REPORT_DIR="$HOME/Library/Logs/DiagnosticReports" +if [ -d "$CRASH_REPORT_DIR" ] && [ -n "$APP_EXECUTABLE_NAME" ]; then + while IFS= read -r crash_file; do + [ -n "$crash_file" ] || continue + ri_log "Collected crash report: $(basename "$crash_file")" + cp -f "$crash_file" "$ARTIFACTS_DIR/" 2>/dev/null || true + done < <(find "$CRASH_REPORT_DIR" -maxdepth 1 -name "${APP_EXECUTABLE_NAME}*" -newer "$LAUNCH_MARKER" 2>/dev/null) +fi + BASE64_STATS_FILE="$ARTIFACTS_DIR/base64-performance-stats.txt" extract_base64_stats "$BASE64_STATS_FILE" "$TEST_LOG" "$FALLBACK_LOG" if [ -s "$BASE64_STATS_FILE" ]; then diff --git a/scripts/run-mac-native-ui-tests.sh b/scripts/run-mac-native-ui-tests.sh index a61663bc15..2c56042d7e 100755 --- a/scripts/run-mac-native-ui-tests.sh +++ b/scripts/run-mac-native-ui-tests.sh @@ -324,6 +324,10 @@ pkill -x "$APP_PROCESS_NAME" >/dev/null 2>&1 || true sleep 1 rm_log "Launching Mac Catalyst app via LaunchServices: $APP_BUNDLE_PATH" +# Timestamp marker so crash reports written during this run can be picked +# out of ~/Library/Logs/DiagnosticReports afterwards (find -newer). +LAUNCH_MARKER="$SCREENSHOT_TMP_DIR/.launch-marker" +touch "$LAUNCH_MARKER" LAUNCH_START=$(date +%s) # `open -W -n -F` waits for the app to terminate, forces a fresh # instance, and skips state restoration. `--stdout / --stderr` pipe the @@ -354,6 +358,28 @@ else rm_log "Warning: could not resolve pid for $APP_PROCESS_NAME" fi +# When the suite times out the app is usually not idle: ParparVM's +# SignalHandler (CodenameOne_GLAppDelegate.m) converts SIGSEGV into a Java +# NPE and returns, so a thread that faulted outside a Java try frame +# re-executes the faulting instruction forever -- that is the "We had a +# signal 11" spam seen in device-runner.log when the suite "hangs". A +# process sample taken at timeout therefore contains the exact faulting +# stack (and, for genuine deadlocks, every thread's wait state). 
+capture_hang_diagnostics() { + local pid spam + pid="$(pgrep -x "$APP_PROCESS_NAME" 2>/dev/null | head -n 1 || true)" + if [ -n "$pid" ]; then + rm_log "Sampling hung app (pid=$pid) -> app-hang-sample.txt" + sample "$pid" 5 -file "$ARTIFACTS_DIR/app-hang-sample.txt" >/dev/null 2>&1 || true + else + rm_log "No live $APP_PROCESS_NAME process found to sample" + fi + spam="$(grep -c 'We had a signal' "$TEST_LOG" 2>/dev/null || true)" + if [ "${spam:-0}" -gt 0 ]; then + rm_log "Signal-handler loop detected: ${spam} 'We had a signal' lines in app stdout (a crashed thread is spinning in ParparVM's SignalHandler; see app-hang-sample.txt for the faulting stack)" + fi +} + END_MARKER="CN1SS:SUITE:FINISHED" TIMEOUT_SECONDS="${CN1SS_SUITE_TIMEOUT_SECONDS:-1500}" START_TIME="$(date +%s)" @@ -377,6 +403,7 @@ while true; do NOW="$(date +%s)" if [ $(( NOW - START_TIME )) -ge $TIMEOUT_SECONDS ]; then rm_log "STAGE:TIMEOUT -> DeviceRunner did not emit completion marker within ${TIMEOUT_SECONDS}s" + capture_hang_diagnostics break fi sleep 5 @@ -413,6 +440,19 @@ fi wait "$APP_PID" 2>/dev/null || true APP_PID=0 +# Collect any crash reports the OS wrote for the app during this run +# (covers the case where the process died outright instead of spinning in +# the signal handler -- LaunchServices apps report to DiagnosticReports, +# not to our stdout pipe). +CRASH_REPORT_DIR="$HOME/Library/Logs/DiagnosticReports" +if [ -d "$CRASH_REPORT_DIR" ]; then + while IFS= read -r crash_file; do + [ -n "$crash_file" ] || continue + rm_log "Collected crash report: $(basename "$crash_file")" + cp -f "$crash_file" "$ARTIFACTS_DIR/" 2>/dev/null || true + done < <(find "$CRASH_REPORT_DIR" -maxdepth 1 -name "${APP_PROCESS_NAME}*" -newer "$LAUNCH_MARKER" 2>/dev/null) +fi + # The app has exited; stop the WebSocket server and adopt whatever it # received. The server wrote one .png per delivered screenshot into # $WS_RAW_DIR. 
When WS delivered at least one image we use that set directly From a4e12a8c37a77750a2b27d6a7e9a07e904d39418 Mon Sep 17 00:00:00 2001 From: Shai Almog <67850168+shai-almog@users.noreply.github.com> Date: Fri, 12 Jun 2026 07:00:32 +0300 Subject: [PATCH 2/3] fix(parparvm): close two heap-collection races behind random suite crashes A sample of a wedged Mac Catalyst screenshot run caught a dying Java thread aborting inside libmalloc (POINTER_BEING_FREED_WAS_NOT_ALLOCATED) under markDeadThread -> collectThreadResources -> placeObjectInHeapCollection, with the GC, the EDT and thread-spawn all piled up behind the critical section it still held. Two bugs in placeObjectInHeapCollection's rarely-taken grow path: 1. Unsynchronized concurrent callers. The GC mark migration waits for a thread's threadActive to drop before migrating its pendingHeapAllocations -- but a thread that finishes runImpl drops threadActive through markDeadThread, which migrates the same buffer concurrently under the critical section the GC never takes. Both sides double-place the same objects and race the grow-and-free of allObjectsInHeap: concurrent grows double-free the old array (the captured abort), and a stale read of the freed array is a use-after-free. The GC migration now takes the critical section and re-checks the thread slot; if the thread died meanwhile, markDeadThread already migrated everything under the same lock. 2. The grow branch left pos at -1, so the placed object's __heapPosition was never recorded. A later reference-counted free could not null its slot (removeObjectFromHeapCollection returns JAVA_FALSE) yet the object was freed anyway, leaving a dangling pointer in allObjectsInHeap for the next sweep to dereference. Also defer freeing the replaced array by one growth cycle since the sweep and the refcount removal path read allObjectsInHeap without the critical section. 
Co-Authored-By: Claude Fable 5 --- vm/ByteCodeTranslator/src/cn1_globals.m | 59 ++++++++++++++++++------- 1 file changed, 42 insertions(+), 17 deletions(-) diff --git a/vm/ByteCodeTranslator/src/cn1_globals.m b/vm/ByteCodeTranslator/src/cn1_globals.m index 57e442002f..8dd7e07b7c 100644 --- a/vm/ByteCodeTranslator/src/cn1_globals.m +++ b/vm/ByteCodeTranslator/src/cn1_globals.m @@ -546,11 +546,22 @@ void placeObjectInHeapCollection(JAVA_OBJECT obj) { memset(tmpAllObjectsInHeap + sizeOfAllObjectsInHeap, 0, sizeof(JAVA_OBJECT) * sizeOfAllObjectsInHeap); memcpy(tmpAllObjectsInHeap, allObjectsInHeap, sizeof(JAVA_OBJECT) * sizeOfAllObjectsInHeap); sizeOfAllObjectsInHeap *= 2; + // Defer freeing the replaced array by one growth cycle: the sweep and the + // reference-counting removal path read allObjectsInHeap without taking the + // critical section, so an immediate free can pull the array out from under + // an in-flight read. Growths double the capacity so they are rare, and at + // most one stale array is retained. + if(oldAllObjectsInHeap != 0) { + free(oldAllObjectsInHeap); + } oldAllObjectsInHeap = allObjectsInHeap; allObjectsInHeap = tmpAllObjectsInHeap; - allObjectsInHeap[currentSizeOfAllObjectsInHeap] = obj; + // record the real slot -- leaving pos at -1 here left the object's + // __heapPosition unset, so a later reference-counted free could not null + // its slot and the sweep would dereference the dangling pointer. + pos = currentSizeOfAllObjectsInHeap; + allObjectsInHeap[pos] = obj; currentSizeOfAllObjectsInHeap++; - free(oldAllObjectsInHeap); } else { allObjectsInHeap[pos] = obj; } @@ -628,23 +639,37 @@ void codenameOneGCMark() { } } - // place allocations from the local thread into the global heap list - if (!t->lightweightThread) { - // For native threads, we need to actually lock them while we traverse the - // heap allocations because we can't use the usual locking mechanisms on - // them. 
- lockThreadHeapMutex(); - } - for(int heapTrav = 0 ; heapTrav < t->heapAllocationSize ; heapTrav++) { - JAVA_OBJECT obj = (JAVA_OBJECT)t->pendingHeapAllocations[heapTrav]; - if(obj) { - t->pendingHeapAllocations[heapTrav] = 0; - placeObjectInHeapCollection(obj); + // place allocations from the local thread into the global heap list. + // The critical section serializes this migration against + // markDeadThread/collectThreadResources: the pause-wait above ends when + // threadActive drops, but a thread that finishes runImpl drops + // threadActive through markDeadThread, so without the lock both sides + // migrate the same pendingHeapAllocations concurrently -- double-placing + // objects and racing placeObjectInHeapCollection's grow-and-free of + // allObjectsInHeap (double free / use-after-free, observed as random + // SIGSEGV or a libmalloc abort that wedges the VM). If the slot no + // longer holds this thread it died and markDeadThread already migrated + // everything under this same lock; skip. + lockCriticalSection(); + if(allThreads[iter] == t) { + if (!t->lightweightThread) { + // For native threads, we need to actually lock them while we traverse the + // heap allocations because we can't use the usual locking mechanisms on + // them. + lockThreadHeapMutex(); + } + for(int heapTrav = 0 ; heapTrav < t->heapAllocationSize ; heapTrav++) { + JAVA_OBJECT obj = (JAVA_OBJECT)t->pendingHeapAllocations[heapTrav]; + if(obj) { + t->pendingHeapAllocations[heapTrav] = 0; + placeObjectInHeapCollection(obj); + } + } + if (!t->lightweightThread) { + unlockThreadHeapMutex(); } } - if (!t->lightweightThread) { - unlockThreadHeapMutex(); - } + unlockCriticalSection(); // this is a thread that allocates a lot and might demolish RAM. We will hold it until the sweep is finished... 
From 13ea640851f0644b192933e690c70fe1620346e6 Mon Sep 17 00:00:00 2001 From: Shai Almog <67850168+shai-almog@users.noreply.github.com> Date: Fri, 12 Jun 2026 07:00:32 +0300 Subject: [PATCH 3/3] fix(ios): retain ExecutableOp's mutable-image render target The Phase 3 v2 mutable-image pipeline tags queued ExecutableOps with the GLUIImage they should render into, but the target ivar was __unsafe_unretained. The main-thread drawFrame drain dereferences it after the EDT queued the op, so a mutable image deallocated in between (Java-side GC finalizing the Image) left a dangling pointer. Caught locally as -[DrawTextureAlphaMask mtlMutableTexture]: unrecognized selector ... CN1MetalBeginMutableImageDraw / drawFrame: when the freed GLUIImage's memory had been reused by another op; with less lucky reuse it is a straight SIGSEGV mid-frame, matching the random mid-suite crashes in the mac-native and iOS Metal screenshot CI jobs. setTarget now retains (released in dealloc), exactly like the ops' image ivars (DrawImage.img et al). The final release can now happen on the main thread during the drain, which is the safe place for a UIKit-backed object. Co-Authored-By: Claude Fable 5 --- Ports/iOSPort/nativeSources/ExecutableOp.h | 8 ++++++-- Ports/iOSPort/nativeSources/ExecutableOp.m | 14 ++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/Ports/iOSPort/nativeSources/ExecutableOp.h b/Ports/iOSPort/nativeSources/ExecutableOp.h index 78d0fa3ed0..5a11a03622 100644 --- a/Ports/iOSPort/nativeSources/ExecutableOp.h +++ b/Ports/iOSPort/nativeSources/ExecutableOp.h @@ -60,8 +60,12 @@ green:((float)((rgbValue >> 8) & 0xff))/255.0 blue:((float)(rgbValue & 0xff))/25 // Phase 3: render target for this op. nil = screen drawable (default, // existing GL/Metal screen pipeline). non-nil = a mutable image whose // backing MTLTexture should receive this op. drawFrame walks the queue - // and switches encoders when target changes between ops. 
- __unsafe_unretained GLUIImage *target; + // and switches encoders when target changes between ops. Retained by + // setTarget (released in dealloc) -- the main-thread drain runs after + // the EDT enqueued the op, so an unretained target could be deallocated + // in between. Plain ivar = __strong under ARC, manual retain otherwise, + // matching the ops' image ivars (e.g. DrawImage.img). + GLUIImage *target; #endif } diff --git a/Ports/iOSPort/nativeSources/ExecutableOp.m b/Ports/iOSPort/nativeSources/ExecutableOp.m index ed081aa8bb..305db130b9 100644 --- a/Ports/iOSPort/nativeSources/ExecutableOp.m +++ b/Ports/iOSPort/nativeSources/ExecutableOp.m @@ -74,6 +74,9 @@ -(void)execute { #ifndef CN1_USE_ARC -(void)dealloc { +#ifdef CN1_USE_METAL + [target release]; +#endif [super dealloc]; } #endif @@ -134,6 +137,17 @@ -(GLUIImage*)target { return target; } -(void)setTarget:(GLUIImage*)t { + // The drawFrame drain dereferences this on the main thread after the EDT + // queued the op. An unretained mutable image can be deallocated in between + // (Java-side GC finalizing the Image), leaving a dangling pointer that + // surfaced as unrecognized-selector / SIGSEGV mid-frame. Retain like the + // ops' image ivars (e.g. DrawImage.img). +#ifndef CN1_USE_ARC + if (t != target) { + [t retain]; + [target release]; + } +#endif target = t; } #endif