Uploaded image for project: 'JDK'
  1. JDK
  2. JDK-4178050

hpi needs to handle EINTR (Socket reads and writes randomly throw InterruptedIOE

XMLWordPrintable

    • Icon: Bug Bug
    • Resolution: Fixed
    • Icon: P3 P3
    • 1.3.1_07
    • 1.2.0, 1.2.1_004, 1.2.2_005, 1.3.0, 1.3.1, 1.3.1_04, 1.4.0
    • hotspot
    • 07
    • x86, sparc
    • solaris_2.6, solaris_7, solaris_8, windows_nt

        11 EAGAIN
            accept(260, 0x00657A88, 0xFA501BE4, 1) = 19
            accept(260, 0x0068B410, 0xFA3E1BE4, 1) Err#11 EAGAIN
            accept(260, 0x0068B410, 0xFA3E1BE4, 1) = 27
            accept(260, 0x00713868, 0xFA0E1BE4, 1) Err#11 EAGAIN
        Received signal #17, SIGUSR2, in accept() [caught]
            accept(18, 0xFDA2167C, 0xFDA2168C, 1) Err#91 ERESTART
            accept(18, 0xFDA2167C, 0xFDA2168C, 1) = 31
            accept(260, 0x00713868, 0xFA0E1BE4, 1) = 19
            accept(260, 0x00657A88, 0xFA501BE4, 1) Err#11 EAGAIN
            accept(260, 0x00657A88, 0xFA501BE4, 1) = 27
            accept(260, 0x005E8898, 0xFB3C1BE4, 1) Err#11 EAGAIN
            accept(260, 0x005E8898, 0xFB3C1BE4, 1) = 36
            accept(260, 0x005F1788, 0xFB391BE4, 1) Err#11 EAGAIN
        Received signal #17, SIGUSR2, in accept() [caught]
            accept(18, 0xFDA2167C, 0xFDA2168C, 1) Err#91 ERESTART
            accept(18, 0xFDA2167C, 0xFDA2168C, 1) = 32
            accept(260, 0x005F1788, 0xFB391BE4, 1) = 19
            accept(260, 0x006E06E0, 0xFA201BE4, 1) Err#11 EAGAIN
            accept(18, 0xFDA2167C, 0xFDA2168C, 1) = 33
        Received signal #17, SIGUSR2, in accept() [caught]
            accept(18, 0xFDA2167C, 0xFDA2168C, 1) Err#91 ERESTART

        I suppose it would be acceptable for an _lwp_kill() to interrupt blocking
        i/o in the thread it is sending a signal to, but the question is, if this
        is the same accept() we're having trouble with, why is the signal being
        sent in the first place?

        Here are actual, typical unedited sections of the truss output, regarding
        how the SIGUSR2 signals are being generated and received:

          1709 lwp_cond_signal(0xFEAE2000) = 0
          1710 lwp_cond_wait(0xFEAE2000, 0xFEAE2010, 0x00000000) = 0
          1711 lwp_mutex_lock(0xFEAE2010) = 0
          1712 read(32, 0xF9C80E70, 2048) = 0
          1713 yield() = 0
          1714 lwp_kill(7, SIGUSR2) = 0
          1715 Received signal #17, SIGUSR2, in accept() [caught]
          1716 siginfo: SIGUSR2 pid=7679 uid=3001 code=-1
          1717 accept(18, 0xFDA2167C, 0xFDA2168C, 1) Err#91 ERESTART

          1726 sysconfig(_CONFIG_SIGRT_MIN) = 38
          1727 sigprocmask(SIG_SETMASK, 0xFECCD520, 0x00000000) = 0
          1728 sigprocmask(SIG_SETMASK, 0xFECD6488, 0xFFBEEFF8) = 0
          1729 lwp_kill(16, SIGUSR2) = 0
          1730 lwp_cond_wait(0xFEAE6930, 0xFEAE6940, 0xFDAA1A88) = 0
          1731 sigprocmask(SIG_SETMASK, 0xFFBEEFF8, 0x00000000) = 0
          1732 Received signal #17, SIGUSR2 [caught]
          1733 siginfo: SIGUSR2 pid=7679 uid=3001 code=-1

         39720 lwp_cond_signal(0xFEAE2000) = 0
         39721 lwp_cond_wait(0xFEAE2000, 0xFEAE2010, 0x00000000) = 0
         39722 lwp_mutex_lock(0xFEAE2010) = 0
         39723 yield() = 0
         39724 lwp_kill(7, SIGUSR2) = 0
         39725 Received signal #17, SIGUSR2, in accept() [caught]
         39726 siginfo: SIGUSR2 pid=7679 uid=3001 code=-1
         39727 accept(18, 0xFDA2167C, 0xFDA2168C, 1) Err#91 ERESTART

         39736 lwp_mutex_lock(0xFECD0C58) = 0
         39737 sigprocmask(SIG_SETMASK, 0xFECD6488, 0xFFBEEFF8) = 0
         39738 lwp_kill(6, SIGUSR2) = 0
         39739 sigprocmask(SIG_SETMASK, 0xFFBEEFF8, 0x00000000) = 0
         39740 Received signal #17, SIGUSR2 [caught]
         39741 siginfo: SIGUSR2 pid=7679 uid=3001 code=-1

        148267 lwp_mutex_lock(0xFECD0C58) = 0
        148268 read(35, " t", 1) = 1
        148269 accept(260, 0x005F1788, 0xFB391BE4, 1) Err#11 EAGAIN
        148270 read(35, "\0", 1) = 1
        148271 read(36, " P O S T / s e r v l e".., 8192) = 7503

        156895 yield() = 0
        156896 lwp_cond_signal(0xFEAE6980) = 0
        156897 lwp_mutex_lock(0xFEAE6990) = 0
        156898 yield() = 0
        156899 lwp_kill(19, SIGUSR2) = 0
        156900 Received signal #17, SIGUSR2 [caught]
        156901 siginfo: SIGUSR2 pid=7679 uid=3001 code=-1
        156902 lwp_kill(7, SIGUSR2) = 0
        156903 Received signal #17, SIGUSR2, in accept() [caught]
        156904 siginfo: SIGUSR2 pid=7679 uid=3001 code=-1
        156905 accept(18, 0xFDA2167C, 0xFDA2168C, 1) Err#91 ERESTART
        156906 lwp_mutex_lock(0xFECD0C58) = 0
        156907 lwp_mutex_wakeup(0xFECD0C58) = 0
        156908 sysconfig(_CONFIG_SIGRT_MIN) = 38

        187134 lwp_sema_post(0xFC471E78) = 0
        187135 lwp_sema_wait(0xFC471E78) = 0
        187136 lwp_mutex_lock(0xFECD0C58) = 0
        187137 lwp_mutex_wakeup(0xFECD0C58) = 0
        187138 accept(260, 0x006E06E0, 0xFA201BE4, 1) Err#11 EAGAIN
        187139 read(19, " P O S T / s e r v l e".., 8192) = 294

        "man -s3xn accept" yields this partial information:

            ERRORS
        The accept() function will fail if:
             
        EAGAIN
             
        EWOULDBLOCK
        O_NONBLOCK is set for the socket file descriptor
        and no connections are present to be accepted.

        It's not clear if this is meant to say that the condition described might
        sometimes return EAGAIN and sometimes EWOULDBLOCK, or whether the man page
        is missing a proper description of what causes the EAGAIN error code (i.e.,
        another Solaris bug, this one in the documentation).

        Now, there's one other curiosity. Even though Java is reporting an
        interrupted system call to our code, there is no evidence of this in the
        truss output. That is to say, I would expect to see places where some of
        the accept() system calls return EINTR, not the EAGAIN and ERESTART values
        that we see above. Perhaps Sun/JavaSoft could verify that these error
        codes are being turned into an exception and an error message that claims
        an interrupted system call.

        --------------------------------------------------------------------------

        Hopefully, the information above will give Sun enough clues to find
        the problem.

        Glenn



        There's a long history to this bug so I'll summarize the issue
        here :-

        1. If an application has JNI code that uses fork then all threads
        in the VM that are blocked in system calls will get a EINTR.

        2. Although (1) should be strongly discouraged it seems that this
        crops up without applications explicitly using fork -- eg: a
        servlet engine running in the same process at the http server.
        If the http server is also forking cgi scripts it's possible
        for the threads in the servlet engine to also get interrupted.

        3. The HPI/JVM api returns EINTR to indicate that a thread has
        been interrupted. However if eintr is returned from a system
        call it gets propogated up to the class libraries and we
        can't distinguish thread.interrupt from interrupted system
        call.

        4. For java.net we require that all the blocking HPI/JVM
        are automatically restarted if interrupted. Currently on
        Solaris (merlin builds) only hpi::accept handles the
        EINTR return code.

        5. I've attached eintr.tar.Z to this bug to duplicate the
        issue. bld.sh compiles the class and creates the shared
        library (need to edit bld.sh to put in your JAVA_HOME
        variable).

        alan.bateman@ireland 2001-02-12

        ---



        Name: mr33420 Date: 10/01/98

        =20
        This problem was orginally reported for 1.2beta3. The response I had from =
        you was that it was fixed in 1.2beta4 so I waited. Now I find that the pro=
        blem persists in 1.2beta4. My original report and the response from Tim Be=
        ll is shown below along with a followup message to Tim for which I have had=
         no response. Therefore I'm resubmitting this as a bug report.

        I have now installed 1.2 beta 4 on Solaris 2.5.1 with all the specified pat=
        ches. Running=20
        native threads, the originally reported problem is still occurring. One ad=
        ditional=20
        apparent correlation is that the threads which are reading (or, less freque=
        ntly, writing)=20
        from sockets seem to get the InterruptedIOException's when another thread i=
        s using=20
        Runtime.exec() to start an external process.

        I have been able to get the application running using green threads instead=
         of native=20
        threads. This seems to eliminate the InterruptedIOException's but introduc=
        es a new=20
        problem. I now seem to randomly get IOException: Resource Temporarily=20
        unavailable. What does this mean? Is there anything meaningful I could do=
         when this=20
        happens on a read? On a write?

        With either native or green threads the application randomly (so it seems) =
        core=20
        dumps. Since I'm not familiar with either the debugger or the Solaris VM i=
        nternals I=20
        have no idea what to do with this situation. Since the application is supp=
        osed to be a=20
        24-hour, non-stop application this isn't cool.

        Native thread execution is prefered since this allows us to scale capacity =
        on the=20
        hosting system by adding CPU's.

        I'm now running the application on a new system built with Solaris 2.6. I =
        do not know if any of these issues=20
        might be related to the OS version. The InterruptedIOException 's are occu=
        rring just as they do on Solaris 2.5.1.
        In addition, I'm now getting another exception, though this is probably rel=
        ated:

        java.lang.NoClassDefFoundError: java/lang/InterruptedIOException
          at java.net.SocketInputStream.socketRead(Native Method)
          at java.net.SocketInputStream.read(Compiled Code)
          .
          .
          .

        Any direction on how I can clear up these issues would be appreciated.

        Thanks,
        Richard Feezel


        On Wed, 17 Jun 1998 16:31:43 -0700, Tim Bell wrote:

        >Hello
        >
        >This looks like a case of Bug ID 4125230
        > Synopsis: 1.2 native threads: interruptible I/O doesn't clear interruted =
        bit
        >
        >
        >The Java(sm) Developer's Connection(sm) (JDC) is a free
        >channel that is maintained by staff here at Sun. Access
        >this web page to join:
        >
        > http://developer.javasoft.com/servlet/RegisterServlet
        >
        >Feel free to check the status of Java(tm) bugs via the JDC at:
        > http://developer.javasoft.com/developer/techDocs/knowledgebase.html
        >
        > "Java(SM) Developer Connection(SM) is a service mark of
        > Sun Microsystems, Inc."
        > "Java Developer Connection est une marque de fabrique ou
        > une marque d =E9pos =E9e de Sun Microsystems, Inc. aux
        > Etats-Unis et dans d'autres pays."
        >
        >If this is not the correct diagnosis in your case, please
        >reply to this message with a short .java program that
        >demonstrates your error, and I'll work on duplicating
        >it here.
        >
        >
        >Best Regards-
        > Tim Bell ###@###.### Source Integration Engineering
        > Java Software Division of Sun Microsystems, Inc.
        >
        > "JavaSoft(TM), Java(TM), and Java(TM) Development Kit
        > are all trademarks of Sun Microsystems, Inc."
        > "JavaSoft(TM), Java(TM), et Java(TM) Development Kit
        > sont des marques d =E9pos =E9es ou enregistr =E9es de Sun
        > Microsystems, Inc. aux Etats-Unis et dans d'autres pays."
        >
        >----------------- Original Bug Report-------------------
        >
        >id : 33830
        >category : java
        >subcategory : classes_io
        >type : bug
        >synopsis : Randon IOExceptions reporting Interrupted system call
        >description : I have a TCP socket server application which uses
        >separate threads for reading and writing a socket
        >which was created by a ServerSocket.accept(). I'm
        >running JDK 1.2beta3 using native threads on=20
        >Solaris 2.5.1 with all the necessary patches for=20
        >native threads. Since a separate thread is used=20
        >for reading from the socket there is a read=20
        >pending on the socket almost all the time. When=20
        >the server decides to terminate the connection it=20
        >uses the Thread.interrupt() method to get the=20
        >reading thread out of the pending read. Since=20
        >this wasn't implemented in JDK 1.1 I'm forced to=20
        >use 1.2.
        >
        >The problem I'm having is that the VM seems to=20
        >throw IOExceptions with the message=20
        >"Interrupted system call" at my threads making=20
        >both read and write requests seemingly randomly!
        >Since I have reads pending on all my open sockets
        >most of the time, reads most often receive these!
        >When I investigated I found that these are not=20
        >being caused by my code calling=20
        >Thread.interrupt(). They seem to correlate with
        >the ServerSocket.accept() accepting a new=20
        >connection. When these occur on a write operation
        >the write seems to be duplicated (the data is sent
        >twice). These seem to occur in bursts often=20
        >affecting several threads and/or multiple times on
        >a single thread.
        >
        >Is it really meaningfull for these to be passed to=20
        >the application at all? I do expect to get an=20
        >InterruptedIOException when I use the=20
        >Thread.interrupt() to terminate the read but this=20
        >looks broken to me.
        >comments : (company - EarthLink Network , email - ###@###.###)
        >workaround :=20
        >cust_name : Richard M. Feezel
        >cust_email : ###@###.###
        >company : EarthLink Network
        >release : 1.2beta3
        >hardware : sun4u
        >OSversion : sol2.6
        >status : Deleted
        >delReason : Duplicate
        >priority : 4
        >sev_impact : 2
        >sev_function : 2
        >cust_type : R
        >bugtraqID : 4125230
        >dateCreated : 1998-06-17 15:01:00.0
        >dateEvaluated : 1998-6-17 16:30:54
        (Review ID: 36380)
        ======================================================================


        4/18/00
        ANOTHER Customer Sees this problem.
        Socket reads fail when using Solaris VM (build Solaris_JDK_1.2.1_04, native
        threads, sunwjit), there is an Interrupted System Call exception being
        thrown.

        We are running Netscape Enterprise Server(iPlanet) 4.0 with ServletExecNSAPI 2.2
        as our servlet engine.

        Referring to the Solaris 7 Multi threaded Programming Guide (of sun
        soft) - it
        said the following: Pg 115 Programming with the operating environment
            "Note that when one thread in a process calls Solaris fork(2),
        threads that are blocked in an interruptible system call return EINTR".

        We suspect that when our threads in the Java VM are waiting on these
        socket reads (blocking), when the solaris os forks new threads, all other
        blocked threads throw an interruptible system call exception and so the read
        over the socket fails. This happens only on Solaris VM (build Solaris_JDK_1.2.1_04, native threads, sunwjit), when we back out to JDK 1.2.1_03 it works fine.

        We have discovered no means (yet) to reproduce this in a context other than
        our full application, although it is certainly very easy to replicate in that
        context.
        --------------------------------------------------------------------------

        Note that while the Multithreaded Programming Guide claims a process should use
        fork1() instead of fork() to prevent this problem of interrupted system calls,
        the fork()/fork1() man page claims that either system call (i.e., fork1() as
        well) will have the same effect. Which part of this conflicting documentation
        is correct?

        --------------------------------------------------------------------------

        The only other thing I can think of to do at our end is to attach truss to the
        ns-httpd process, and see what system calls it is making at the time we get the
        interrupted system calls. All that will do is verify whether it's fork() or
        fork1() that's causing the problem. It still won't tell us what piece of
        iPlanet, ServletExec, or the JVM is making this system call.
        --------------------------------------------------------------------------

        Okay, I just took the trouble to run truss on the ns-httpd process. I see
        in the ServletExec.log file that we had a number of interrupted system calls.
        But I see *no* fork() or fork1() calls in the truss output! What I do see
        are a number of issues dealing with signals.

            # fgrep sig syscalls | sort -u
        siginfo: SIGLWP pid=7679 uid=3001 code=-1
        siginfo: SIGSEGV SEGV_MAPERR addr=0x00000008
        siginfo: SIGUSR2 pid=7679 uid=3001 code=-1
        Received signal #11, SIGSEGV [caught]
        Received signal #14, SIGALRM, in lwp_sema_wait() [caught]
        Received signal #17, SIGUSR2 [caught]
        Received signal #17, SIGUSR2, in accept() [caught]
        Received signal #33, SIGLWP [caught]
            lwp_cond_signal(0x0016CBA8) = 0
            lwp_cond_signal(0x0026DDF8) = 0
            lwp_cond_signal(0x004EE968) = 0
            lwp_cond_signal(0x0052E718) = 0
            lwp_cond_signal(0x00CBE5F8) = 0
            lwp_cond_signal(0x010D6510) = 0
            lwp_cond_signal(0x0111A358) = 0
            lwp_cond_signal(0xFEAE2000) = 0
            lwp_cond_signal(0xFEAE6930) = 0
            lwp_cond_signal(0xFEAE6980) = 0
            lwp_cond_signal(0xFECD0C48) = 0
            lwp_sigredirect(0, SIGPIPE) = 0
            signotifywait() = 13
            sigprocmask(SIG_BLOCK, 0xFECCDEF8, 0x00000000) = 0
            sigprocmask(SIG_SETMASK, 0xFE20BE2C, 0x00000000) = 0
            sigprocmask(SIG_SETMASK, 0xFECCD520, 0x00000000) = 0
            sigprocmask(SIG_SETMASK, 0xFECD6488, 0xFFBEEFF8) = 0
            sigprocmask(SIG_SETMASK, 0xFFBEEFF8, 0x00000000) = 0
            sigprocmask(SIG_UNBLOCK, 0xFECCDEF8, 0x00000000) = 0

        That SIGSEGV looks awfully bad, to start with; this is the first time we've
        seen that something serious is happening, as there is no evidence of this in
        the iPlanet log files, the ServletExec log files, or our own servlet log file.
        But there is only one instance of that in the truss output, and there are many
        instances of interrupted system calls. The SIGUSR2 signal is happening with
        something much closer to the number of interrupted system calls. Searching
        for that in the truss output yields:

            % fgrep SIGUSR syscalls
            lwp_kill(7, SIGUSR2) = 0
        Received signal #17, SIGUSR2, in accept() [caught]
        siginfo: SIGUSR2 pid=7679 uid=3001 code=-1
            lwp_kill(16, SIGUSR2) = 0
        Received signal #17, SIGUSR2 [caught]
        siginfo: SIGUSR2 pid=7679 uid=3001 code=-1
            lwp_kill(7, SIGUSR2) = 0
        Received signal #17, SIGUSR2, in accept() [caught]
        siginfo: SIGUSR2 pid=7679 uid=3001 code=-1
            lwp_kill(6, SIGUSR2) = 0
        Received signal #17, SIGUSR2 [caught]
        siginfo: SIGUSR2 pid=7679 uid=3001 code=-1
            lwp_kill(7, SIGUSR2) = 0
        Received signal #17, SIGUSR2, in accept() [caught]
        siginfo: SIGUSR2 pid=7679 uid=3001 code=-1
            lwp_kill(20, SIGUSR2) = 0
        Received signal #17, SIGUSR2 [caught]
        siginfo: SIGUSR2 pid=7679 uid=3001 code=-1
            lwp_kill(19, SIGUSR2) = 0
        Received signal #17, SIGUSR2 [caught]
        siginfo: SIGUSR2 pid=7679 uid=3001 code=-1
            lwp_kill(7, SIGUSR2) = 0
        Received signal #17, SIGUSR2, in accept() [caught]
        siginfo: SIGUSR2 pid=7679 uid=3001 code=-1
            lwp_kill(20, SIGUSR2) = 0
        Received signal #17, SIGUSR2 [caught]
        siginfo: SIGUSR2 pid=7679 uid=3001 code=-1
            lwp_kill(7, SIGUSR2) = 0
        Received signal #17, SIGUSR2, in accept() [caught]
        siginfo: SIGUSR2 pid=7679 uid=3001 code=-1
            lwp_kill(22, SIGUSR2) = 0
        Received signal #17, SIGUSR2 [caught]
        siginfo: SIGUSR2 pid=7679 uid=3001 code=-1

        Hmmm. There is no man page for lwp_kill(), even though truss is only recording
        system calls. Okay, a further search shows a system call named _lwp_kill();
        clearly, truss has a bug in that it stripped the leading underscore. The man
        page says nothing about this system call doing anything to interrupt blocking
        i/o calls; is this another Solaris bug? Anyway, Peter tells me that the interrupted
        system calls that he is detecting are accept(), which matches something of what
        we see here. Looking for such calls in the truss output yields:

            accept(260, 0x00757A68, 0xF9F61BE4, 1) = 19
            accept(260, 0x007BE838, 0xF9EA1BE4, 1) Err#11 EAGAIN
            accept(260, 0x007BE838, 0xF9EA1BE4, 1) = 19
            accept(260, 0x00624908, 0xFB271BE4, 1) Err#11 EAGAIN
            accept(260, 0x00624908, 0xFB271BE4, 1) = 19
            accept(260, 0x00657A88, 0xFA501BE4, 1) Err#11 EAGAIN
            accept(18, 0xFDA2167C, 0xFDA2168C, 1) = 32
        Received signal #17, SIGUSR2, in accept() [caught]
            accept(18, 0xFDA2167C, 0xFDA2168C, 1) Err#91 ERESTART
            accept(260, 0x00657A88, 0xFA501BE4, 1) = 19
            accept(260, 0x00713868, 0xFA0E1BE4, 1) Err#11 EAGAIN
            accept(18, 0xFDA2167C, 0xFDA2168C, 1) = 27
            accept(260, 0x00713868, 0xFA0E1BE4, 1) = 19
            accept(260, 0x0068B410, 0xFA3E1BE4, 1) = 32
            accept(260, 0x006E8F20, 0xFA1D1BE4, 1) Err#11 EAGAIN
            accept(260, 0x006E8F20, 0xFA1D1BE4, 1) = 33
            accept(260, 0x006AD538, 0xFA321BE4, 1) Err#11 EAGAIN
            accept(260, 0x006AD538, 0xFA321BE4, 1) = 33
            accept(260, 0x007BE838, 0xF9EA1BE4, 1) Err#11 EAGAIN
        Received signal #17, SIGUSR2, in accept() [caught]
            accept(18, 0xFDA2167C, 0xFDA2168C, 1) Err#91 ERESTART
            accept(260, 0x007BE838, 0xF9EA1BE4, 1) = 27
            accept(260, 0x00657A88, 0xFA501BE4, 1) Err#

              psoper Pete Soper (Inactive)
              mr Mark Reinhold
              Votes:
              0 Vote for this issue
              Watchers:
              1 Start watching this issue

                Created:
                Updated:
                Resolved:
                Imported:
                Indexed: