修复掉画面的bug

This commit is contained in:
2025-10-24 22:04:28 +08:00
parent fe3c19fb21
commit 23f63e42c8
12 changed files with 796 additions and 470 deletions

View File

@@ -6,7 +6,7 @@ import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError
from pathlib import Path
from typing import Dict, Optional, List
from typing import Dict, Optional, List, Any
import random
import socket
import http.client
@@ -55,6 +55,15 @@ class DeviceInfo:
# WDA Ready 等待HTTP 轮询方式,不触发 xctest
WDA_READY_TIMEOUT = float(os.getenv("WDA_READY_TIMEOUT", "35.0"))
# WDA 轻量复位策略
MJPEG_BAD_THRESHOLD = int(os.getenv("MJPEG_BAD_THRESHOLD", "3")) # 连续几次 mjpeg 健康失败才重置 WDA
WDA_RESET_COOLDOWN = float(os.getenv("WDA_RESET_COOLDOWN", "10")) # WDA 复位冷却,避免风暴
# 防连坐参数(支持环境变量)
GLITCH_SUPPRESS_SEC = float(os.getenv("GLITCH_SUPPRESS_SEC", "6.0")) # 扫描异常后抑制移除的秒数
MASS_DROP_RATIO = float(os.getenv("MASS_DROP_RATIO", "0.6")) # 一次性丢失占比阈值
ABSENT_TICKS_BEFORE_REMOVE = int(os.getenv("ABSENT_TICKS_BEFORE_REMOVE", "3")) # 连续缺席轮数
def __init__(self):
# 自增端口游标仅作兜底扫描使用
self._port = 9110
@@ -69,7 +78,7 @@ class DeviceInfo:
# 并发保护 & 状态表
self._lock = threading.RLock()
self._port_by_udid: Dict[str, int] = {} # UDID -> 当前使用的本地端口
self._port_by_udid: Dict[str, int] = {} # UDID -> 当前使用的本地端口(映射 wdaScreenPort
self._pid_by_udid: Dict[str, int] = {} # UDID -> iproxy PID
# 抗抖
@@ -81,6 +90,17 @@ class DeviceInfo:
self._trusted_cache: Dict[str, float] = {} # udid -> expire_ts
self._wda_ok_cache: Dict[str, float] = {} # udid -> expire_ts
# 新增MJPEG 连续坏计数 + 最近一次 WDA 复位时间
self._mjpeg_bad_count: Dict[str, int] = {}
self._last_wda_reset: Dict[str, float] = {}
# 新增:按 UDID 的 /status 探测单飞锁,避免临时 iproxy 并发
self._probe_locks: Dict[str, threading.Lock] = {}
# 防连坐
self._scan_glitch_until = 0.0 # 截止到该时间前,认为扫描不可靠,跳过移除
self._absent_ticks: Dict[str, int] = {} # udid -> 连续缺席次数
LogManager.info("DeviceInfo init 完成;日志已启用", udid="system")
# ---------------- 主循环 ----------------
@@ -108,13 +128,70 @@ class DeviceInfo:
with self._lock:
known = set(self._models.keys())
# 真正移除(连续缺席超过宽限期)
# -------- 全局扫描异常检测(防连坐)--------
missing = [u for u in known if u not in online_now]
mass_drop = (len(known) > 0) and (
(len(online_now) == 0) or
(len(missing) / max(1, len(known)) >= self.MASS_DROP_RATIO)
)
if mass_drop:
self._scan_glitch_until = now + self.GLITCH_SUPPRESS_SEC
LogManager.method_warning(
f"检测到扫描异常known={len(known)}, online={len(online_now)}, "
f"missing={len(missing)},进入抑制窗口 {self.GLITCH_SUPPRESS_SEC}s",
method, udid="system"
)
# 真正移除(仅在非抑制窗口内 + 连续缺席达到阈值 才移除)
for udid in list(known):
if udid in online_now:
# 在线:清空缺席计数
self._absent_ticks.pop(udid, None)
continue
# 离线:记录一次缺席
miss = self._absent_ticks.get(udid, 0) + 1
self._absent_ticks[udid] = miss
last = self._last_seen.get(udid, 0.0)
if udid not in online_now and (now - last) >= self.REMOVE_GRACE_SEC:
LogManager.info(f"设备判定离线(超过宽限期 {self.REMOVE_GRACE_SEC}slast_seen={last}", udid=udid)
exceed_grace = (now - last) >= self.REMOVE_GRACE_SEC
exceed_ticks = miss >= self.ABSENT_TICKS_BEFORE_REMOVE
# 抑制窗口内:跳过任何移除
if now < self._scan_glitch_until:
continue
if exceed_grace and exceed_ticks:
# --- 移除前的“可达性”反校验 ---
try:
with self._lock:
model = self._models.get(udid)
port = model.screenPort if model else -1
reachable = False
# 1) ip:port 的 MJPEG 是否还在
if port and port > 0 and self._health_check_mjpeg(port, timeout=0.8):
reachable = True
# 2) WDA /status 是否仍然正常
if not reachable and self._health_check_wda(udid):
reachable = True
if reachable:
# 误报:续命
self._last_seen[udid] = now
self._absent_ticks[udid] = 0
LogManager.method_info("离线误报:反校验可达,取消移除并续命", method, udid=udid)
continue
except Exception as e:
LogManager.method_warning(f"离线反校验异常:{e}", method, udid=udid)
LogManager.info(
f"设备判定离线(超过宽限期 {self.REMOVE_GRACE_SEC}s 且 连续缺席 {self._absent_ticks[udid]} 次)",
udid=udid
)
self._remove_device(udid)
self._last_topology_change_ts = now
# 清理计数
self._absent_ticks.pop(udid, None)
# 真正新增(连续在线超过稳定期)
new_candidates = [u for u in online_now if u not in known]
@@ -164,7 +241,7 @@ class DeviceInfo:
LogManager.method_warning(f"读取系统版本失败:{e}", method, udid=udid)
system_version_major = 0 # 保底
# === iOS>17先“被动探测WDA未运行则交给 IOSActivator并通过 HTTP 轮询等待 ===
# === iOS>17被动探测 WDA未运行则交给 IOSActivator并通过 HTTP 轮询等待 ===
if system_version_major > 17:
if self._wda_is_running(udid):
LogManager.method_info("检测到 WDA 已运行,直接映射", method, udid=udid)
@@ -209,6 +286,8 @@ class DeviceInfo:
model.ready = True
self._models[udid] = model
self._procs[udid] = proc
# 初始化计数
self._mjpeg_bad_count[udid] = 0
LogManager.method_info(f"设备添加完成port={port}, {w}x{h}@{s}", method, udid=udid)
self._manager_send(model)
@@ -227,6 +306,9 @@ class DeviceInfo:
self._wda_ok_cache.pop(udid, None)
self._last_seen.pop(udid, None)
self._first_seen.pop(udid, None)
self._mjpeg_bad_count.pop(udid, None)
self._last_wda_reset.pop(udid, None)
self._absent_ticks.pop(udid, None)
self._kill(proc)
if pid:
@@ -266,52 +348,96 @@ class DeviceInfo:
return False
# ======= WDA 探测/等待(仅走 iproxy+HTTP不触发 xctest =======
def _wda_http_status_ok(self, udid: str, timeout_sec: float = 1.2) -> bool:
"""临时 iproxy 转发到 wdaFunctionPortGET /status 成功视为 OK。"""
method = "_wda_http_status_ok"
tmp_port = self._pick_new_port()
proc = None
try:
cmd = [self._iproxy_path, "-u", udid, str(tmp_port), str(wdaFunctionPort)]
proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
if not self._wait_until_listening(tmp_port, initial_timeout=0.8):
LogManager.method_info(f"WDA探测临时端口未监听{tmp_port}", method, udid=udid)
return False
def _get_probe_lock(self, udid: str) -> threading.Lock:
with self._lock:
lk = self._probe_locks.get(udid)
if lk is None:
lk = threading.Lock()
self._probe_locks[udid] = lk
return lk
conn = http.client.HTTPConnection("127.0.0.1", tmp_port, timeout=timeout_sec)
def _wda_http_status_ok(self, udid: str, timeout_sec: float = 1.2) -> bool:
"""起临时 iproxy 到 wdaFunctionPort探测 /status。增加单飞锁与严格清理。"""
method = "_wda_http_status_ok"
lock = self._get_probe_lock(udid)
if not lock.acquire(timeout=3.0):
# 有并发探测在进行,避免同时起多个 iproxy直接返回“未知→False”
LogManager.method_info("状态探测被并发锁抑制", method, udid=udid)
return False
try:
tmp_port = self._pick_new_port()
proc = None
try:
conn.request("GET", "/status")
resp = conn.getresponse()
_ = resp.read(256)
code = getattr(resp, "status", 0)
ok = 200 <= code < 400
LogManager.method_info(f"WDA探测/status code={code}, ok={ok}", method, udid=udid)
return ok
except Exception as e:
LogManager.method_info(f"WDA探测异常{e}", method, udid=udid)
cmd = [self._iproxy_path, "-u", udid, str(tmp_port), str(wdaFunctionPort)]
# --- Windows 下隐藏 iproxy 控制台 ---
creationflags = 0
startupinfo = None
if os.name == "nt":
creationflags = getattr(subprocess, "CREATE_NO_WINDOW", 0x08000000) | \
getattr(subprocess, "CREATE_NEW_PROCESS_GROUP", 0x00000200)
si = subprocess.STARTUPINFO()
si.dwFlags |= subprocess.STARTF_USESHOWWINDOW
si.wShowWindow = 0
startupinfo = si
proc = subprocess.Popen(
cmd,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
creationflags=creationflags,
startupinfo=startupinfo
)
if not self._wait_until_listening(tmp_port, initial_timeout=1.0):
LogManager.method_info(f"WDA探测临时端口未监听{tmp_port}", method, udid=udid)
return False
# /status 双重尝试,减少瞬态抖动
for _ in (1, 2):
try:
conn = http.client.HTTPConnection("127.0.0.1", tmp_port, timeout=timeout_sec)
conn.request("GET", "/status")
resp = conn.getresponse()
_ = resp.read(256)
code = getattr(resp, "status", 0)
ok = 200 <= code < 400
LogManager.method_info(f"WDA探测/status code={code}, ok={ok}", method, udid=udid)
try:
conn.close()
except Exception:
pass
if ok:
return True
time.sleep(0.2)
except Exception as e:
LogManager.method_info(f"WDA探测异常{e}", method, udid=udid)
time.sleep(0.2)
return False
finally:
try:
conn.close()
except Exception:
pass
finally:
if proc:
try:
p = psutil.Process(proc.pid)
p.terminate()
p.wait(timeout=0.6)
except Exception:
if proc:
try:
p.kill()
p = psutil.Process(proc.pid)
p.terminate()
try:
p.wait(timeout=1.2)
except psutil.TimeoutExpired:
p.kill()
p.wait(timeout=1.2)
except Exception:
pass
# 兜底强杀
try:
os.kill(proc.pid, signal.SIGTERM)
except Exception:
pass
finally:
try:
lock.release()
except Exception:
pass
def _wait_wda_ready_http(self, udid: str, total_timeout_sec: float = None, interval_sec: float = 0.6) -> bool:
"""
通过 _wda_http_status_ok 轮询等待 WDA Ready。
total_timeout_sec 默认取环境变量 WDA_READY_TIMEOUT默认 35s
"""
"""通过 _wda_http_status_ok 轮询等待 WDA Ready。"""
method = "_wait_wda_ready_http"
if total_timeout_sec is None:
total_timeout_sec = self.WDA_READY_TIMEOUT
@@ -387,11 +513,11 @@ class DeviceInfo:
deadline = _monotonic() + to
while _monotonic() < deadline:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.settimeout(0.2)
s.settimeout(0.25)
if s.connect_ex(("127.0.0.1", port)) == 0:
LogManager.method_info(f"端口已开始监听:{port}", method, udid="system")
return True
time.sleep(0.05)
time.sleep(0.06)
LogManager.method_info(f"监听验收阶段超时:{port},扩展等待", method, udid="system")
LogManager.method_warning(f"监听验收最终超时:{port}", method, udid="system")
return False
@@ -490,6 +616,7 @@ class DeviceInfo:
LogManager.method_info(f"自愈被退避抑制,剩余 {delta}s", method, udid=udid)
return
old_port = None
with self._lock:
proc = self._procs.get(udid)
if proc:
@@ -499,6 +626,7 @@ class DeviceInfo:
if not model:
LogManager.method_warning("模型不存在,取消自愈", method, udid=udid)
return
old_port = model.screenPort
proc2 = self._start_iproxy(udid, port=None)
if not proc2:
@@ -518,26 +646,78 @@ class DeviceInfo:
model.screenPort = self._port_by_udid.get(udid, model.screenPort)
self._models[udid] = model
self._manager_send(model)
LogManager.method_info(f"[PORT-SWITCH] {udid} {old_port} -> {self._port_by_udid.get(udid)}", method, udid=udid)
LogManager.method_info(f"重启成功,使用新端口 {self._port_by_udid.get(udid)}", method, udid=udid)
# ---------------- 健康检查 ----------------
def _health_check_mjpeg(self, port: int, timeout: float = 0.8) -> bool:
def _health_check_mjpeg(self, port: int, timeout: float = 1.8) -> bool:
"""使用 GET 真实探测 MJPEG校验 Content-Type 和 boundary。尝试 /mjpeg -> /mjpegstream -> /"""
method = "_health_check_mjpeg"
try:
conn = http.client.HTTPConnection("127.0.0.1", port, timeout=timeout)
conn.request("HEAD", "/")
resp = conn.getresponse()
_ = resp.read(128)
code = getattr(resp, "status", 0)
conn.close()
return 200 <= code < 400
except Exception:
return False
paths = ["/mjpeg", "/mjpegstream", "/"]
for path in paths:
try:
conn = http.client.HTTPConnection("127.0.0.1", port, timeout=timeout)
conn.request("GET", path, headers={"Connection": "close"})
resp = conn.getresponse()
ctype = (resp.getheader("Content-Type") or "").lower()
ok_hdr = (200 <= resp.status < 300) and ("multipart/x-mixed-replace" in ctype)
# 仅读少量字节,不阻塞
chunk = resp.read(1024)
try:
conn.close()
except Exception:
pass
if ok_hdr and (b"--" in chunk):
return True
except Exception:
pass
return False
def _health_check_wda(self, udid: str) -> bool:
# 使用 HTTP 探测(带短缓存),避免触发 xctest
"""使用 HTTP 探测(带短缓存),避免触发 xctest"""
# 加一次重试,减少瞬态波动
if self._wda_is_running(udid, cache_sec=1.0):
return True
time.sleep(0.2)
return self._wda_is_running(udid, cache_sec=1.0)
def _maybe_reset_wda_lightweight(self, udid: str) -> bool:
    """Try a lightweight WDA reset for a device whose MJPEG keeps failing.

    First stops and restarts the WDA app through ``tidevice``; if that does
    not bring ``/status`` back within 12 s, falls back to ``IOSActivator``
    and waits another 12 s.

    Attempts are rate-limited per device by ``WDA_RESET_COOLDOWN``. The
    attempt time is recorded before any work is done, so a reset that keeps
    failing is throttled as well instead of being retried on every health
    tick.

    Returns:
        True when WDA reports ready after either strategy, False when the
        call was suppressed by the cooldown or both strategies failed.
    """
    method = "_maybe_reset_wda_lightweight"
    now = _monotonic()
    # No 0.0 default here: presumably _monotonic is time.monotonic, whose
    # origin is undefined, so "never reset" is tracked as a missing entry.
    last = self._last_wda_reset.get(udid)
    if last is not None and now - last < self.WDA_RESET_COOLDOWN:
        return False
    # Stamp the attempt up front so failures count against the cooldown too.
    self._last_wda_reset[udid] = now

    LogManager.method_warning("MJPEG 连续异常,尝试 WDA 轻量复位", method, udid=udid)
    try:
        dev = tidevice.Device(udid)
        # Stop first; a failure (e.g. WDA not running) is not fatal because
        # the start below is what matters.
        try:
            dev.app_stop(WdaAppBundleId)
            time.sleep(1.0)
        except Exception:
            pass
        dev.app_start(WdaAppBundleId)
        # Shortened readiness wait compared to the default.
        if self._wait_wda_ready_http(udid, total_timeout_sec=12.0):
            self._last_wda_reset[udid] = _monotonic()
            return True
    except Exception as e:
        LogManager.method_warning(f"WDA stop/start 失败:{e}", method, udid=udid)

    # Fallback: let IOSActivator bring WDA up, then wait again.
    try:
        ios = IOSActivator()
        ios.activate(udid)
        if self._wait_wda_ready_http(udid, total_timeout_sec=12.0):
            self._last_wda_reset[udid] = _monotonic()
            return True
    except Exception as e:
        LogManager.method_warning(f"IOSActivator 复位失败:{e}", method, udid=udid)

    return False
def _check_and_heal_tunnels(self, interval: float = 5.0):
method = "_check_and_heal_tunnels"
now = _monotonic()
@@ -557,21 +737,41 @@ class DeviceInfo:
if port <= 0:
continue
ok_local = self._health_check_mjpeg(port, timeout=0.8)
ok_local = self._health_check_mjpeg(port, timeout=1.8)
ok_wda = self._health_check_wda(udid)
LogManager.method_info(f"健康检查mjpeg={ok_local}, wda={ok_wda}, port={port}", method, udid=udid)
if not (ok_local and ok_wda):
if ok_local and ok_wda:
self._mjpeg_bad_count[udid] = 0
continue
# 分层自愈MJPEG 连续异常而 WDA 正常 → 优先复位 WDA
if (not ok_local) and ok_wda:
cnt = self._mjpeg_bad_count.get(udid, 0) + 1
self._mjpeg_bad_count[udid] = cnt
if cnt >= self.MJPEG_BAD_THRESHOLD:
if self._maybe_reset_wda_lightweight(udid):
# 复位成功后重启 iproxy确保新流映射
self._restart_iproxy(udid)
self._mjpeg_bad_count[udid] = 0
continue # 下一个设备
# 若未达门槛或复位失败,仍执行 iproxy 重启
LogManager.method_warning(f"检测到不健康触发重启port={port}", method, udid=udid)
self._restart_iproxy(udid)
continue
# ---------------- Windows/*nix列出所有 iproxy 命令行 ----------------
def _get_all_iproxy_cmdlines(self) -> List[str]:
method = "_get_all_iproxy_cmdlines"
lines: List[str] = []
with self._lock:
live_pids = set(self._pid_by_udid.values())
# 其他情况wda 不健康或两者都不健康):先重启 iproxy
LogManager.method_warning(f"检测到不健康触发重启port={port}", method, udid=udid)
self._restart_iproxy(udid)
# ---------------- 进程枚举(结构化返回) ----------------
def _get_all_iproxy_entries(self) -> List[Dict[str, Any]]:
"""
返回结构化 iproxy 进程项:
{ 'pid': int, 'name': str, 'cmdline': List[str], 'udid': str|None, 'local_port': int|None, 'remote_port': int|None }
"""
method = "_get_all_iproxy_entries"
entries: List[Dict[str, Any]] = []
is_windows = os.name == "nt"
target_name = "iproxy.exe" if is_windows else "iproxy"
@@ -580,58 +780,106 @@ class DeviceInfo:
name = (p.info.get("name") or "").lower()
if name != target_name:
continue
if p.info["pid"] in live_pids:
continue
cmdline = p.info.get("cmdline") or []
if not cmdline:
continue
udid = None
local_port = None
remote_port = None
# 解析 -u <udid> 与后续的两个端口LOCAL_PORT, REMOTE_PORT
if "-u" in cmdline:
cmd = " ".join(cmdline)
lines.append(f"{cmd} {p.info['pid']}")
try:
i = cmdline.index("-u")
if i + 1 < len(cmdline):
udid = cmdline[i + 1]
# 在 -u udid 之后扫描数字端口
ints = []
for token in cmdline[i + 2:]:
if token.isdigit():
ints.append(int(token))
# 停止条件:拿到两个
if len(ints) >= 2:
break
if len(ints) >= 2:
local_port, remote_port = ints[0], ints[1]
else:
# 兜底:全局找两个数字
ints2 = [int(t) for t in cmdline if t.isdigit()]
if len(ints2) >= 2:
local_port, remote_port = ints2[-2], ints2[-1]
except Exception:
pass
entries.append({
"pid": p.info["pid"],
"name": name,
"cmdline": cmdline,
"udid": udid,
"local_port": local_port,
"remote_port": remote_port
})
except (psutil.NoSuchProcess, psutil.AccessDenied):
continue
LogManager.method_info(f"扫描到候选 iproxy 进程数={len(lines)}", method, udid="system")
return lines
# ---------------- 杀孤儿 ----------------
LogManager.method_info(f"扫描到候选 iproxy 进程数={len(entries)}", method, udid="system")
return entries
# ---------------- 杀孤儿(含“同 UDID 的非当前实例”清理) ----------------
def _cleanup_orphan_iproxy(self):
method = "_cleanup_orphan_iproxy"
with self._lock:
live_udids = set(self._models.keys())
live_pids = set(self._pid_by_udid.values())
live_pid_by_udid = dict(self._pid_by_udid)
live_port_by_udid = dict(self._port_by_udid)
cleaned = 0
for ln in self._get_all_iproxy_cmdlines():
parts = ln.split()
try:
if "-u" not in parts:
continue
udid = parts[parts.index('-u') + 1]
pid = int(parts[-1])
if pid not in live_pids and udid not in live_udids:
self._kill_pid_gracefully(pid)
cleaned += 1
LogManager.method_warning(f"孤儿 iproxy 已清理udid={udid}, pid={pid}", method, udid=udid)
except (ValueError, IndexError):
for ent in self._get_all_iproxy_entries():
pid = ent["pid"]
udid = ent.get("udid")
local_port = ent.get("local_port")
# 完全不认识的进程(无法解析 udid跳过
if not udid:
continue
# 1) 完全孤儿udid 不在活跃设备集,且 pid 不是任何已跟踪 pid → 杀
if udid not in live_udids and pid not in live_pid_by_udid.values():
self._kill_pid_gracefully(pid, silent=True)
cleaned += 1
LogManager.method_info(f"孤儿 iproxy 已清理udid={udid}, pid={pid}", method)
continue
# 2) 同 UDID 的非当前实例udid 活跃,但 pid != 当前 pid且本地端口也不是当前端口 → 杀
live_pid = live_pid_by_udid.get(udid)
live_port = live_port_by_udid.get(udid)
if udid in live_udids and pid != live_pid:
if (local_port is None) or (live_port is None) or (local_port != live_port):
self._kill_pid_gracefully(pid, silent=True)
cleaned += 1
LogManager.method_info(f"清理同UDID旧实例udid={udid}, pid={pid}, local_port={local_port}", method)
if cleaned:
LogManager.method_info(f"孤儿清理完成,数量={cleaned}", method, udid="system")
LogManager.method_info(f"孤儿清理完成,数量={cleaned}", method)
# ---------------- 按 PID 强杀 ----------------
def _kill_pid_gracefully(self, pid: int):
method = "_kill_pid_gracefully"
def _kill_pid_gracefully(self, pid: int, silent: bool = False):
"""优雅地结束进程不弹出cmd窗口"""
try:
p = psutil.Process(pid)
p.terminate()
try:
p.wait(timeout=1.0)
LogManager.method_info(f"进程已终止pid={pid}", method, udid="system")
except psutil.TimeoutExpired:
p.kill()
LogManager.method_warning(f"进程被强制 killpid={pid}", method, udid="system")
if platform.system() == "Windows":
# 不弹窗方式
subprocess.run(
["taskkill", "/PID", str(pid), "/F", "/T"],
stdout=subprocess.DEVNULL if silent else None,
stderr=subprocess.DEVNULL if silent else None,
creationflags=getattr(subprocess, "CREATE_NO_WINDOW", 0x08000000),
)
else:
# Linux / macOS
os.kill(pid, signal.SIGTERM)
except Exception as e:
LogManager.method_warning(f"kill 进程异常pid={pid}, err={e}", method, udid="system")
LogManager.method_error(f"结束进程 {pid} 失败: {e}", "_kill_pid_gracefully")
# ---------------- 端口工具(兜底) ----------------
def _pick_free_port(self, start: int = None, limit: int = 2000) -> int: