时间:2021-07-01 10:21:17 帮助过:3人阅读
这次的问题相比西安研发中心曾经碰上的Python信号处理问题,有不少基础知识、先验知识是共用的,此处不做再普及,感兴趣的同学可以翻看我以前发过的文章。
下文是一次具体的调试、分析记录。为了简化现场、方便调试,已将原始问题、衍生问题浓缩成DebugPythonWithGDB_6.py、DebugPythonWithGDB_7.py。
$ vi DebugPythonWithGDB_6.py
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 | #!/usr/bin/env python # -*- encoding: utf-8 -*- import sys,os,signal,subprocess,shlex,traceback def on_SIGCHLD(signum,frame): print"[on_SIGCHLD" sys.stdout.write("signum = %u\n"%signum) traceback.print_stack(frame) print os.waitpid(-1,os.WNOHANG) """ try : print os.waitpid( -1, os.WNOHANG ) except OSError : sys.stdout.write( ‘Line[%u]: OSError\n‘ % sys.exc_info()[2].tb_lineno ) """ print"on_SIGCHLD]" def do_more(count): print‘[do_more() begin %u]‘%count os.system(r‘printf "Child = %u\n" $$;/bin/sleep 1‘) """ # # 这里存在竞争条件,可以增加触发OSError异常的概率 # os.system( r‘printf "Child =%u\n" $$;/bin/sleep 1‘ ) os.system( r‘printf "Child =%u\n" $$;/bin/sleep 1‘ ) os.system( r‘printf "Child =%u\n" $$;/bin/sleep 1‘ ) os.system( r‘printf "Child =%u\n" $$;/bin/sleep 1‘ ) """ print‘[do_more() end %u]‘%count def main(prog,args): if0==len(args): print‘Usage: %s ‘%prog else: sys.stdout.write("Parent = %u\n"%os.getpid()) # # 本例中,即使有下列代码,Ctrl-C仍然无效。 # signal.signal(signal.SIGINT,signal.SIG_DFL) # # signal.signal( signal.SIGCHLD, signal.SIG_IGN ) # signal.signal(signal.SIGCHLD,on_SIGCHLD) # count =0 whileTrue: # # 本例中父进程只是一个调度框架,不需要与子进程进行通信,因此不 # 需要特别处理"stdin=None, stdout=None, stderr=None"。 # child =subprocess.Popen \ ( # # 不要直接用args[0].split(),它在处理单、双引号时不是我们 # 期望的行为。考虑这种例子,ls -l "/tmp/non exist" # shlex.split(args[0]), # # all file descriptors except 0, 1 and 2 will be closed # before the child process is executed # close_fds =True, cwd ="/tmp" ) sys.stdout.write("Child = %u\n"%child.pid) # # child.send_signal( signal.SIGTERM ) # child.terminate() # child.kill() # # child.wait() # do_more(count) count +=1 if‘__main__‘==__name__: try: main(os.path.basename(sys.argv[0]),sys.argv[1:]) except KeyboardInterrupt: pass |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 | $python DebugPythonWithGDB_6.py‘python -c "import time;time.sleep(3600)"‘ Parent =10244 Child =10245 [do_more()begin0] [on_SIGCHLD signum =17 File"DebugPythonWithGDB_6.py",line81,in main(os.path.basename(sys.argv[0]),sys.argv[1:]) File"DebugPythonWithGDB_6.py",line76,inmain do_more(count) File"DebugPythonWithGDB_6.py",line20,indo_more print‘[do_more() begin %u]‘%count (10245,9) on_SIGCHLD] Child =10246 [on_SIGCHLD signum =17 File"DebugPythonWithGDB_6.py",line81,in main(os.path.basename(sys.argv[0]),sys.argv[1:]) File"DebugPythonWithGDB_6.py",line76,inmain do_more(count) File"DebugPythonWithGDB_6.py",line21,indo_more os.system(r‘printf "Child = %u\n" $$;/bin/sleep 1‘) Traceback(most recent call last): File"DebugPythonWithGDB_6.py",line81,in main(os.path.basename(sys.argv[0]),sys.argv[1:]) File"DebugPythonWithGDB_6.py",line76,inmain do_more(count) File"DebugPythonWithGDB_6.py",line21,indo_more os.system(r‘printf "Child = %u\n" $$;/bin/sleep 1‘) File"DebugPythonWithGDB_6.py",line10,inon_SIGCHLD print os.waitpid(-1,os.WNOHANG) OSError:[Errno10]No childprocesses |
流程进入on_SIGCHLD(),但os.waitpid()抛出OSError异常。帮助里写的是,如果系统调用waitpid()返回-1,就抛出异常: An OSError is raised with the value of errno when the syscall returns -1. 10245号子进程在on_SIGCHLD()里waitpid()成功,(10245, 9)中的9表示该进程是被SIGKILL干掉的,符合预期。 10246号子进程是do_more()里的os.system()产生的shell进程,它结束时向10244号父进程投递了SIGCHLD信号。on_SIGCHLD()里waitpid()时,已经在别处wait*()过,10246号子进程已经彻底消失,系统调用waitpid()返回-1,Python函数os.waitpid()抛出异常。 整个过程非常复杂,用伪代码描述如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 | do_more() os.system() posix_system() // posixmodule.c __libc_system() // weak_alias (__libc_system, system) do_system() // sysdeps/posix/system.c /* * SIG_IGN * * Ctrl-C暂时失效 */ sigaction(SIGINT,&sa,&intr) /* * 屏蔽(阻塞)SIGCHLD信号 */ sigaddset(&sa.sa_mask,SIGCHLD) sigprocmask(SIG_BLOCK,&sa.sa_mask,&omask) fork() 子进程(10246号子进程) /* * 恢复原有SIGINT信号处理方式 */ sigaction(SIGINT,&intr,(structsigaction *)NULL) /* * 调用"sh -c ..." */ execve() [shell子进程结束,向DebugPythonWithGDB_6.py投递SIGCHLD] [由于SIGCHLD信号已被屏蔽(阻塞),其保持在内核态的未决信号链上] 父进程(10244号父进程) /* * 同步调用,会阻塞。不是在信号句柄中异步调用。 * * 10246号子进程在此被wait*()回收后彻底消失 */ waitpid(pid,&status,0) /* * 恢复原有SIGINT信号处理方式 */ sigaction(SIGINT,&intr,(structsigaction *)NULL) /* * 取消对SIGCHLD的屏蔽(阻塞) */ sigprocmask(SIG_SETMASK,&omask,(sigset_t *)NULL) [SIGCHLD信号的屏蔽(阻塞)被取消] [DebugPythonWithGDB_6.py的C级信号句柄signal_handler()安排"延迟调用"后返回] [DebugPythonWithGDB_6.py的on_SIGCHLD()此时并未得到执行,因为built-in函数os.system()尚未返回] /* * built-in函数os.system()返回后,10244号父进程开始处理"延迟调用",调用 * Python级信号句柄。这个SIGCHLD信号是10246号子进程投递过来的。 * * DebugPythonWithGDB_6.py的on_SIGCHLD()得到执行 */ on_SIGCHLD() /* * 调用waitpid( -1, &status, WNOHANG ),试图处理10246号子进程。 * * 10246号子进程已为前述waitpid( pid, &status, 0 )所处理,此处系统调用 * 返回-1,导致os.waitpid()抛出OSError异常。 */ os.waitpid(-1,os.WNOHANG) |
整个过程之所以如此复杂,主要是因为Python的信号处理机制比较复杂,让已经非常复杂的Linux信号机制再添变数。参看:
1 2 | 《2.50对Python解释器进行调试》 《22.0Linux信号机制》 |
就本例而言,为了确保DebugPythonWithGDB_6.py不因OSError异常而终止,只需在on_SIGCHLD()中调用os.waitpid()时捕捉OSError异常:
1 2 3 4 5 | def on_SIGCHLD(signum,frame): try: print os.waitpid(-1,os.WNOHANG) except OSError: sys.stdout.write(‘Line[%u]: OSError\n‘%sys.exc_info()[2].tb_lineno) |
前述观点有些是动态调试得到,有些是静态分析得到。有人可能问了,为什么不拦截Python进程的C级信号句柄,查看SIGCHLD信号源,以此确认10246号子进程可能被回收两次?其实我最初也想这么干来着,但这是行不通的,因为Python的C级信号句柄signal_handler()是那种最原始的单形参信号句柄,不是高大上的三形参信号句柄。 用GDB调试Python解释器:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 | # gdb -q -ex "b *signal_handler" -ex r --args /usr/bin/python2.7-dbg DebugPythonWithGDB_6.py ‘/usr/bin/python2.7-dbg -c "import time;time.sleep(3600)"‘ ... Breakpoint1at0x8216f2d:file../Modules/signalmodule.c,line185. Starting program:/usr/bin/python2.7-dbg DebugPythonWithGDB_6.py/usr/bin/python2.7-dbg\-c\\"import\time\;time.sleep\(3600\)\" [Thread debugging using libthread_db enabled] Using host libthread_db library"/lib/i386-linux-gnu/i686/cmov/libthread_db.so.1". Parent =10284 Child =10288 [do_more()begin0] Child =10289 Breakpoint1,signal_handler(sig_num=17)at../Modules/signalmodule.c:185 185 { (gdb)py-bt #10 Frame 0xb7c20034, for file DebugPythonWithGDB_6.py, line 21, in do_more (count=0) os.system(r‘printf "Child = %u\n" $$;/bin/sleep 1‘) #13 Frame 0xb7cb37dc, for file DebugPythonWithGDB_6.py, line 76, in main (prog=‘DebugPythonWithGDB_6.py‘, args=[‘/usr/bin/python2.7-dbg -c "import time;time.sleep(3600)"‘], count=0, child=) do_more(count) #16 Frame 0xb7cbe49c, for file DebugPythonWithGDB_6.py, line 81, in () main(os.path.basename(sys.argv[0]),sys.argv[1:]) (gdb)bt7 #0 signal_handler (sig_num=17) at ../Modules/signalmodule.c:185 #1 #2 0xb7fdcd3c in __kernel_vsyscall () #3 0xb7db25eb in __sigprocmask (how=how@entry=2, set=0x0, set@entry=0xbffff0d4, oset=oset@entry=0x0) at ../sysdeps/unix/sysv/linux/sigprocmask.c:57 #4 0xb7dc2084 in do_system (line=line@entry=0xb7cbf9e4 "printf \"Child = %u\\n\" $$;/bin/sleep 1") at ../sysdeps/posix/system.c:161 #5 0xb7dc2380 in __libc_system (line=line@entry=0xb7cbf9e4 "printf \"Child = %u\\n\" $$;/bin/sleep 1") at ../sysdeps/posix/system.c:184 #6 0xb7fa9bfb in system (line=0xb7cbf9e4 "printf \"Child = %u\\n\" $$;/bin/sleep 1") at pt-system.c:28 (More stack frames follow...) |
查看#4的system.c:161,这个位置已经在waitpid( pid, &status, 0 )之后: sigprocmask( SIG_SETMASK, &omask, (sigset_t *)NULL ) 其作用是取消对SIGCHLD的屏蔽(阻塞)。 此时内存布局如下:
1 2 3 4 5 6 7 8 9 10 11 12 | 内存高址方向 fpstate // ESP+0x2DC output/x *(struct _fpstate *)($esp+0x2dc) retcode // ESP+0x2D4 x/3i $esp+0x2d4 extramask // ESP+0x2D0 x/1wx $esp+0x2d0 fpstate_unused // ESP+0x60 output/x *(struct _fpstate *)($esp+0x60) sigcontext_ia32// ESP+8 output/x *(struct sigcontext *)($esp+8) sig // ESP+4 信号值,信号句柄第一形参 pretcode // ESP RetAddr=__kernel_sigreturn // hexdump $esp 0x2dc 内存低址方向 |