about | summary | refs | log | tree | commit | diff
path: root/scripts
diff options
context:
space:
mode:
author Eelco Dolstra <e.dolstra@tudelft.nl> 2010-02-03 20:35:37 +0000
committer Eelco Dolstra <e.dolstra@tudelft.nl> 2010-02-03 20:35:37 +0000
commit d0c32dc135f147ad352e28ff8c648e611516edec (patch)
tree c474e310974cb3f9aa9acb0be1625c7d1cd50799 /scripts
parent f56a039775930d4ba2b4504440b7ab37dfefeb75 (diff)
* In the build hook, if connecting to a machine fails, try the other
machines of the right type (if available). This makes the build farm more robust to failures.
Diffstat (limited to 'scripts')
-rwxr-xr-x scripts/build-remote.pl.in 143
-rw-r--r-- scripts/nix-copy-closure.in 2
-rw-r--r-- scripts/ssh.pm 5
3 files changed, 77 insertions, 73 deletions
diff --git a/scripts/build-remote.pl.in b/scripts/build-remote.pl.in
index 3ba4a60fd..da26b8596 100755
--- a/scripts/build-remote.pl.in
+++ b/scripts/build-remote.pl.in
@@ -71,6 +71,7 @@ while (<CONF>) {
, sshKeys => $3
, maxJobs => $4
, speedFactor => 1.0 * ($6 || 1)
+ , enabled => 1
};
}
@@ -92,89 +93,96 @@ sub openSlotLock {
}
-# Find all machine that can execute this build, i.e., that support
-# builds for the given platform and are not at their job limit.
-my $rightType = 0;
-my @available = ();
-LOOP: foreach my $cur (@machines) {
- if (grep { $neededSystem eq $_ } @{$cur->{systemTypes}}) {
- $rightType = 1;
-
- # We have a machine of the right type. Determine the load on
- # the machine.
- my $slot = 0;
- my $load = 0;
- my $free;
- while ($slot < $cur->{maxJobs}) {
- my $slotLock = openSlotLock($cur, $slot);
- if (flock($slotLock, LOCK_EX | LOCK_NB)) {
- $free = $slot unless defined $free;
- flock($slotLock, LOCK_UN) or die;
- } else {
- $load++;
+my $hostName;
+
+while (1) {
+
+ # Find all machine that can execute this build, i.e., that support
+ # builds for the given platform and are not at their job limit.
+ my $rightType = 0;
+ my @available = ();
+ LOOP: foreach my $cur (@machines) {
+ if ($cur->{enabled} && grep { $neededSystem eq $_ } @{$cur->{systemTypes}}) {
+ $rightType = 1;
+
+ # We have a machine of the right type. Determine the load on
+ # the machine.
+ my $slot = 0;
+ my $load = 0;
+ my $free;
+ while ($slot < $cur->{maxJobs}) {
+ my $slotLock = openSlotLock($cur, $slot);
+ if (flock($slotLock, LOCK_EX | LOCK_NB)) {
+ $free = $slot unless defined $free;
+ flock($slotLock, LOCK_UN) or die;
+ } else {
+ $load++;
+ }
+ close $slotLock;
+ $slot++;
}
- close $slotLock;
- $slot++;
- }
- push @available, { machine => $cur, load => $load, free => $free }
+ push @available, { machine => $cur, load => $load, free => $free }
if $load < $cur->{maxJobs};
+ }
}
-}
-if (defined $ENV{NIX_DEBUG_HOOK}) {
- print STDERR "load on " . $_->{machine}->{hostName} . " = " . $_->{load} . "\n"
- foreach @available;
-}
+ if (defined $ENV{NIX_DEBUG_HOOK}) {
+ print STDERR "load on " . $_->{machine}->{hostName} . " = " . $_->{load} . "\n"
+ foreach @available;
+ }
-# Didn't find any available machine? Then decline or postpone.
-if (scalar @available == 0) {
- # Postpone if we have a machine of the right type, except if the
- # local system can and wants to do the build.
- if ($rightType && !$canBuildLocally) {
- sendReply "postpone";
- exit 0;
- } else {
- decline;
+ # Didn't find any available machine? Then decline or postpone.
+ if (scalar @available == 0) {
+ # Postpone if we have a machine of the right type, except if the
+ # local system can and wants to do the build.
+ if ($rightType && !$canBuildLocally) {
+ sendReply "postpone";
+ exit 0;
+ } else {
+ decline;
+ }
}
-}
-# Prioritise the available machines as follows:
-# - First by load divided by speed factor, rounded to the nearest
-# integer. This causes fast machines to be preferred over slow
-# machines with similar loads.
-# - Then by speed factor.
-# - Finally by load.
-sub lf { my $x = shift; return int($x->{load} / $x->{machine}->{speedFactor} + 0.4999); }
-@available = sort
- { lf($a) <=> lf($b)
- || $b->{machine}->{speedFactor} <=> $a->{machine}->{speedFactor}
- || $a->{load} <=> $b->{load}
- } @available;
+ # Prioritise the available machines as follows:
+ # - First by load divided by speed factor, rounded to the nearest
+ # integer. This causes fast machines to be preferred over slow
+ # machines with similar loads.
+ # - Then by speed factor.
+ # - Finally by load.
+ sub lf { my $x = shift; return int($x->{load} / $x->{machine}->{speedFactor} + 0.4999); }
+ @available = sort
+ { lf($a) <=> lf($b)
+ || $b->{machine}->{speedFactor} <=> $a->{machine}->{speedFactor}
+ || $a->{load} <=> $b->{load}
+ } @available;
-# Select the best available machine and lock a free slot.
-my $selected = $available[0];
-my $machine = $selected->{machine};
+ # Select the best available machine and lock a free slot.
+ my $selected = $available[0];
+ my $machine = $selected->{machine};
-my $slotLock = openSlotLock($machine, $selected->{free});
-flock($slotLock, LOCK_EX | LOCK_NB) or die;
-utime undef, undef, $slotLock;
+ my $slotLock = openSlotLock($machine, $selected->{free});
+ flock($slotLock, LOCK_EX | LOCK_NB) or die;
+ utime undef, undef, $slotLock;
-close MAINLOCK;
+ close MAINLOCK;
+
+
+ # Connect to the selected machine.
+ @sshOpts = ("-i", $machine->{sshKeys}, "-x");
+ $hostName = $machine->{hostName};
+ last if openSSHConnection $hostName;
+
+ warn "unable to open SSH connection to $hostName, trying other available machines...\n";
+ $machine->{enabled} = 0;
+}
# Tell Nix we've accepted the build.
sendReply "accept";
-if (defined $ENV{NIX_DEBUG_HOOK}) {
- my $hostName = $machine->{hostName};
- my $sp = $machine->{speedFactor};
- print STDERR "building `$drvPath' on `$hostName' - $sp - " . $selected->{free} . "\n";
- sleep 10;
- exit 0;
-}
my $x = <STDIN>;
chomp $x;
@@ -184,13 +192,8 @@ if ($x ne "okay") {
# Do the actual build.
-my $hostName = $machine->{hostName};
print STDERR "building `$drvPath' on `$hostName'\n";
-push @sshOpts, "-i", $machine->{sshKeys}, "-x";
-
-openSSHConnection $hostName;
-
my $inputs = `cat inputs`; die if ($? != 0);
$inputs =~ s/\n/ /g;
diff --git a/scripts/nix-copy-closure.in b/scripts/nix-copy-closure.in
index 313d6f019..59046814b 100644
--- a/scripts/nix-copy-closure.in
+++ b/scripts/nix-copy-closure.in
@@ -53,7 +53,7 @@ while (@ARGV) {
}
-openSSHConnection $sshHost;
+openSSHConnection $sshHost or die "$0: unable to start SSH\n";
if ($toMode) { # Copy TO the remote machine.
diff --git a/scripts/ssh.pm b/scripts/ssh.pm
index 0295cef33..cea486675 100644
--- a/scripts/ssh.pm
+++ b/scripts/ssh.pm
@@ -12,15 +12,16 @@ sub openSSHConnection {
my ($host) = @_;
die if $sshStarted;
$sshHost = $host;
- return if system("ssh $sshHost @sshOpts -O check 2> /dev/null") == 0;
+ return 1 if system("ssh $sshHost @sshOpts -O check 2> /dev/null") == 0;
my $tmpDir = tempdir("nix-ssh.XXXXXX", CLEANUP => 1, TMPDIR => 1)
or die "cannot create a temporary directory";
push @sshOpts, "-S", "$tmpDir/control";
system("ssh $sshHost @sshOpts -M -N -f") == 0
- or die "unable to start SSH: $?";
+ or return 0;
$sshStarted = 1;
+ return 1;
}
# Tell the master SSH client to exit.