svn commit: r224298 - projects/portbuild/scripts
Mark Linimon
linimon at FreeBSD.org
Sun Jul 24 23:02:25 UTC 2011
Author: linimon (doc,ports committer)
Date: Sun Jul 24 23:02:24 2011
New Revision: 224298
URL: http://svn.freebsd.org/changeset/base/224298
Log:
Add a great deal of error handling to pollmachine. One specific case that
seems to make pollmachine go catatonic (stop responding) is the "disk full" condition.
Modified:
projects/portbuild/scripts/pollmachine
Modified: projects/portbuild/scripts/pollmachine
==============================================================================
--- projects/portbuild/scripts/pollmachine Sun Jul 24 20:09:42 2011 (r224297)
+++ projects/portbuild/scripts/pollmachine Sun Jul 24 23:02:24 2011 (r224298)
@@ -2,7 +2,7 @@
#
# pollmachine
#
-# Monitors build machines and notifies qmgr of changes
+# Monitors build machines and notifies qmanager of changes
#
# pollmachine [options] [arch] ...
@@ -16,7 +16,7 @@
#
# TODO:
-# XXX qmgr notification of new/removed machines
+# XXX qmanager notification of new/removed machines
# XXX counter before declaring a machine as dead
# Declares a machine as online if it reports 0 data from infoseek?
@@ -28,6 +28,10 @@ import sys, threading, socket
from time import sleep
import os, subprocess, logging
+EXPECTED_LINES = 6
+
+DEBUG=False
+
pbc = os.getenv('PORTBUILD_CHECKOUT') \
if os.getenv('PORTBUILD_CHECKOUT') else "/var/portbuild"
pbd = os.getenv('PORTBUILD_DATA') \
@@ -122,25 +126,42 @@ class MachinePoll(threading.Thread):
try:
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.settimeout(60)
- s.connect((self.host, self.port))
+ retval = s.connect_ex((self.host, self.port))
+ if retval != 0:
+ if self.online:
+ logging.info("[%s] Connection error: %s" % (self.mach, `retval`))
+ self.timeouts += 1
+ else:
+ if DEBUG:
+ logging.info("%s connected to socket for %s" % ( str(self), self.mach ))
- data = ""
- while len(data) < 65536:
- chunk = s.recv(8192)
- if not chunk:
- break
- data += chunk
-
- nowonline = True
- self.timeouts = 0
- lines = data.split("\n")
+ data = ""
+ while len(data) < 65536:
+ chunk = s.recv(8192)
+ if not chunk:
+ break
+ data += chunk
+
+ if DEBUG:
+ logging.info("%s: len(data) = %d" % (self.mach, len(data)))
+ if len(data) > 0:
+ lines = data.split("\n")
+ if len(lines) >= EXPECTED_LINES:
+ nowonline = True
+ self.timeouts = 0
+ else:
+ # XXX MCL
+ if DEBUG or True:
+ logging.info("%s: truncated reply: %s" % (self.mach, lines))
except socket.timeout:
if self.online:
logging.info("[%s] Connection timeout" % self.mach)
self.timeouts += 1
if self.timeouts < 3:
nowonline = self.online
- except:
+ except Exception, e:
+ print "pollmachine: exception in poll for %s:" %self.mach
+ print e
pass
finally:
try:
@@ -153,7 +174,7 @@ class MachinePoll(threading.Thread):
self.online = nowonline
if self.online:
self.timeouts = 0
- # XXX inform qmgr of state change
+ # XXX inform qmanager of state change
if self.online and not lines and not self.timeouts:
# reportload script is missing
@@ -180,7 +201,7 @@ class MachinePoll(threading.Thread):
if old != part[2]:
self.vars[part[0]] = part[2]
# logging.info("%s@%s: \"%s\" -> \"%s\"" % (part[0], self.mach, old, part[2]))
- # XXX update qmgr
+ # XXX inform qmanager
try:
envs = self.vars['buildenvs']
@@ -222,16 +243,31 @@ class MachinePoll(threading.Thread):
except KeyError:
pass
+ if DEBUG:
+ logging.info("%s recording current system load for %s" % ( str(self), self.mach ))
# Record current system load
+ # note: can fail on "file system full"
try:
f = file("%s/%s/loads/%s" % (pbd, self.arch, self.mach), "w")
- except:
+ except Exception, e:
+ print "pollmachine: exception in creating %s/%s/loads/%s:" % (pbd, self.arch, self.mach)
+ print e
return
try:
- f.write("%s %s\n" % (self.vars['jobs'], self.vars['load']))
- except:
- pass
- f.close()
+ if 'jobs' in self.vars and 'load' in self.vars:
+ f.write("%s %s\n" % (self.vars['jobs'], self.vars['load']))
+ else:
+ # machine is not responding to poll.
+ # XXX MCL remove from machines
+ # XXX inform qmanager
+ f.write("")
+ f.close()
+ except Exception, e:
+ print "pollmachine: exception in writing %s/%s/loads/%s:" % (pbd, self.arch, self.mach)
+ print self.vars
+ print e
+ if DEBUG:
+ logging.info("%s finished polling for %s" % ( str(self), self.mach ))
def setup(self, branch, buildid, args = ""):
cmd = "su ports-%s -c \"%s/scripts/dosetupnode %s %s %s %s %s\""\
@@ -275,13 +311,13 @@ while True:
for mach in gone:
logging.info("Removing machine %s/%s" % (arch, mach))
- # XXX disable from qmgr
+ # XXX disable from qmanager
pollthreads[mach].shutdown=True
del pollthreads[mach]
for mach in new:
logging.info("Adding machine %s/%s" % (arch, mach))
- # XXX set up qmgr
+ # XXX set up qmanager
pc="%s/%s/portbuild.conf" % (pbd, arch)
pch="%s/%s/portbuild.%s" % (pbd, arch, mach)
@@ -303,4 +339,10 @@ while True:
if not polldelay:
break
+ if DEBUG:
+ logging.info("Ready to sleep")
sleep(polldelay)
+ if DEBUG:
+ logging.info("Wakeup")
+
+logging.info("pollmachine: exiting.")
More information about the svn-src-projects mailing list