Rev 42 | Blame | Compare with Previous | Last modification | View Log | Download | RSS feed
#!/usr/bin/env perl
# All functions related to maniplating/reporting on cluster
# part of havirt.
# Copyright 2024 Daily Data, Inc.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following
# conditions are met:
#
# Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
# Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the distribution.
# Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
# NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
# THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# v0.0.1 20240602 RWR
# Initial setup
#
# v1.2.0 20240826 RWR
# Added some code to migrate domains if node placed in maintenance mode
# Added a lot of 'verbose' print lines, and modified for new flag structure
#
# v1.3.0 20250511 RWR
# Added balance function. If called, will attempt to balance a cluster so that the variance is lower than balance_max_variance
# (new entry in config file). --dryrun will simply display the commands sent, and --nodryrun will execute them.
#
# v1.3.0 20250514 RWR
# Modified so it will not issue error codes if we have done iterations but not perfectly in balance. Also, added message
# 'already in balance' if it is as good as we can get it.
#
package cluster;
use warnings;
use strict;
# define the version number
# see https://metacpan.org/pod/release/JPEACOCK/version-0.97/lib/version.pod
use version;
our $VERSION = version->declare("1.3.1");
use Data::Dumper;
use Exporter;
our @ISA = qw( Exporter );
our @EXPORT = qw(
&list
&iscsi
);
sub help {
my @return;
push @return, 'cluster status';
push @return, "\t[--format|-f screen|tsv] - displays some stats on cluster resources used";
push @return, 'cluster balance';
push @return, "\tBalances resources by moving domains between nodes";
push @return, 'cluster iscsi';
push @return, "\tdisplays list of all iSCSI targets 'known' by system";
push @return, 'cluster iscsi add ip-or-dns-name';
push @return, "\tAdds iscsi target to system";
push @return, 'cluster iscsi delete ip-or-dns-name';
push @return, "\tDelete iSCSI target processed by system. ip-or-dns-name MUST be exact";
push @return, 'cluster iscsi update [node ...]';
push @return, "\tPerforms an update to add new iSCSI targets on one or more nodes";
push @return, "\tScans all iSCSI targets, looking for new shares on each, then performs";
push @return, "\ta login, adding it to the node. DOES NOT delete old targets at this";
push @return, "\ttime. If no nodes passed in, will perform function on all nodes not";
push @return, "\tin maintenance mode";
push @return, 'cluster balance';
push @return, "\tAttempts to balance node memory usage by migrating domains to less used";
push @return, "\tnodes. If a node is in maintenance mode, will attempt to move all domains";
push @return, "\toff of it and balance them on the other nodes";
return join( "\n", @return ) . "\n";
}
sub status {
my $return = '';
&main::readDB();
my @header = ('Node','Threads','Memory','Domains','vcpu','mem_used', 'Status' );
my @data;
my $usedmem = 0;
my $usedcpu = 0;
my $availmem = 0;
my $availcpu = 0;
my $totalDomains = 0;
my $maintenance = 0;
foreach my $node (sort keys %{ $main::statusDB->{'node'} } ) {
my $memory = 0;
my $vcpus = 0;
my $count = 0;
foreach my $domain ( keys %{ $main::statusDB->{'nodePopulation'}->{$node}->{'running'} } ) {
$memory += $main::statusDB->{'virt'}->{$domain}->{'memory'};
$vcpus += $main::statusDB->{'virt'}->{$domain}->{'vcpu'};
$count++;
}
push @data, [ $node,$main::statusDB->{'node'}->{$node}->{cpu_count},$main::statusDB->{'node'}->{$node}->{memory},$count,$vcpus,$memory, $main::statusDB->{'node'}->{$node}->{maintenance} ? 'Maintenance' : 'Online' ];
$usedmem += $memory;
$usedcpu += $vcpus;
$totalDomains += $count;
$availmem += $main::statusDB->{'node'}->{$node}->{memory};
$availcpu += $main::statusDB->{'node'}->{$node}->{cpu_count};
$maintenance += $main::statusDB->{'node'}->{$node}->{maintenance} ? 0 : 1;
} # outer for
push @data, [ 'Total',$availcpu,$availmem,$totalDomains,$usedcpu,$usedmem, $maintenance ];
return &main::report( \@header, \@data );
}
# perform various functions on iSCSI target definitions
# on all nodes
sub iscsi {
my $action = shift;
my @return;
if ( $action && $action eq 'add' ) {
&main::readDB(1);
while ( my $target = shift ) {
$main::statusDB->{'cluster'}->{'iscsi'}->{$target} = '';
}
&main::writeDB();
} elsif ( $action && $action eq 'delete' ) {
my $target = shift;
&main::readDB(1);
delete $main::statusDB->{'cluster'}->{'iscsi'}->{$target} if exists $main::statusDB->{'cluster'}->{'iscsi'}->{$target};
&main::writeDB();
} elsif ( $action && $action eq 'update' ) {
&main::readDB();
# if they did not give us a node, do all of them
@_ = keys %{ $main::statusDB->{'node'} } unless @_;
while ( my $node = shift ) { # process each node on stack
if ( $main::statusDB->{'node'}->{$node}->{'maintenance'} ) {
print "Not processing node $node since it is in maintenance mode\n" if $main::config->{'flags'}->{'verbose'};
} else { # actually do the work
push @return, &updateISCITargets( $node );
}
} # while
}
&main::readDB();
push @return, "iSCSI targets are";
if ( $main::statusDB->{'cluster'}->{'iscsi'} ) {
push @return, join( "\n", keys %{ $main::statusDB->{'cluster'}->{'iscsi'} } );
} else {
push @return, "None Defined";
}
return join( "\n", @return ) . "\n";
}
# updates iSCSI targets on $node
# scans each target defined and compares it to the current session
# adding new targets if they exist
# NOTE: does not delete targets which no longer exist on server
sub updateISCITargets {
my $node = shift;
my $command;
my %targets;
my @return;
push @return, "Processing iSCSI targets on $node";
print Dumper( keys %{ $main::statusDB->{'cluster'}->{'iscsi'} } ) if $main::config->{'flags'}->{'debug'};
foreach my $server (keys %{ $main::statusDB->{'cluster'}->{'iscsi'} } ) {
print "\n" . '-'x40 . "\nGetting targets on server $server\n" . '-'x40 . "\n" if $main::config->{'flags'}->{'verbose'};
$command = &main::makeCommand( $node, "iscsiadm -m discovery -t st -p $server" );
my @list = `$command`;
chomp @list;
# @list contains lines of type
# 10.19.209.2:3260,1 iqn.2014-11.net.dailydata.castor:simon0
# split them apart and add them to the hash
foreach my $entry ( @list ) {
my ( $portal, $targetName ) = split( ' ', $entry );
# $portal has some extra info after a comma, so clean it up
$portal =~ m/^([0-9:.]+)/;
$portal = $1;
# some targets return multiple IP's for a given name, so
# only add them if they are in this IP
$targets{ $targetName } = $portal if $portal =~ m/^$server/;
print "$targetName\t$targets{ $targetName }\n" if $main::config->{'flags'}->{'verbose'};
} # foreach
} # while
print "\n" . '-'x40 . "\nGetting active sessions\n". '-'x40 . "\n" if $main::config->{'flags'}->{'verbose'};
# now, get active sessions so we can filter them
$command = &main::makeCommand( $node, "iscsiadm -m session" );
my @activeSessions = `$command`;;
chomp @activeSessions;
foreach my $session ( @activeSessions ) {
$session =~ m/^.*[^0-9:.]([0-9,:.]+).*(iqn\S*)/;
my ( $portal,$targetName ) = ( $1,$2 );
print "$portal\t$targetName" if $main::config->{'flags'}->{'verbose'};
if ( exists( $targets{$targetName} ) ) {
print "\tNOT updating\n" if $main::config->{'flags'}->{'verbose'};
delete $targets{ $targetName };
} else {
print "Needs to be added\n" if $main::config->{'flags'}->{'verbose'};
}
}
# check if we have any new entries and bail if not
if ( scalar keys %targets ) {
# We have new entries, so run them;
foreach my $targetName ( sort keys %targets ) {
my $portal = $targets{$targetName};
push @return, "Adding $targetName";
$command = &main::makeCommand( $node, "iscsiadm -m node --targetname '$targetName' --portal '$portal' --login" );
if ( $main::config->{'flags'}->{'dryrun'} ) {
push @return, $command;
} else {
`$command`;
}
}
} else {
push @return, "No new entries";
}
return join( "\n", @return ) . "\n";
} # updateISCITargets
# calculate stats about the cluster, including the amount of memory/cpu used, the standard deviation
# and variance. Used mainly to balance cluster
sub getClusterStats {
my $return = {};
$return->{'cluster'}->{'memory'} = 0;
$return->{'cluster'}->{'used_memory'} = 0;
$return->{'cluster'}->{'count'} = 0;
$return->{'cluster'}->{'used_vcpu'} = 0;
$return->{'cluster'}->{'domain_count'} = 0;
foreach my $node (sort keys %{ $main::statusDB->{'node'} } ) {
# only count nodes which are not in maintenance as part of the cluster towards total memory available
if ( ! $main::statusDB->{'node'}->{$node}->{'maintenance'} ) {
$return->{'cluster'}->{'memory'} += $main::statusDB->{'node'}->{$node}->{'memory'};
$return->{'cluster'}->{'vcpu'} += $main::statusDB->{'node'}->{$node}->{'cpu_count'};
$return->{'cluster'}->{'count'}++;
} else {
$return->{'node'}->{$node}->{'maintenance'} = 1;
}
$return->{'node'}->{$node}->{'memory'} = $main::statusDB->{'node'}->{$node}->{'memory'};
$return->{'node'}->{$node}->{'vcpu'} = $main::statusDB->{'node'}->{$node}->{'cpu_count'};
$return->{'node'}->{$node}->{'used_memory'} = 0;
$return->{'node'}->{$node}->{'count'} = 0;
$return->{'node'}->{$node}->{'used_vcpu'} = 0;
# get individual stats for every domain on the node
foreach my $domain ( keys %{ $main::statusDB->{'nodePopulation'}->{$node}->{'running'} } ) {
# track used memory, and count
$return->{'node'}->{$node}->{'used_memory'} += $main::statusDB->{'virt'}->{$domain}->{'memory'};
$return->{'node'}->{$node}->{'used_vcpu'} += $main::statusDB->{'virt'}->{$domain}->{'vcpu'};
$return->{'node'}->{$node}->{'count'}++;
}
# calculate the average memory used in the node
$return->{'node'}->{$node}->{'average_memory'} = $return->{'node'}->{$node}->{'used_memory'} /
(
$main::statusDB->{'node'}->{$node}->{'maintenance'} ? 0.0001 : $main::statusDB->{'node'}->{$node}->{'memory'}
);
# add the used memory to the cluster
$return->{'cluster'}->{'used_memory'} += $return->{'node'}->{$node}->{'used_memory'};
$return->{'cluster'}->{'used_vcpu'} += $return->{'node'}->{$node}->{'used_vcpu'};
$return->{'cluster'}->{'domain_count'} += $return->{'node'}->{$node}->{'count'};
}
# calculate the deviation for each active node in the cluster
$return->{'cluster'}->{'average_memory'} = $return->{'cluster'}->{'used_memory'} / $return->{'cluster'}->{'memory'};
# get the deviation for each node
# variance in the cluster is simply the average of all deviations
$return->{'cluster'}->{'variance'} = 0;
foreach my $node (sort keys %{ $main::statusDB->{'node'} } ) {
# deviation is the square of the difference between this node and the cluster overall
$return->{'node'}->{$node}->{'deviation'} = (
$return->{'node'}->{$node}->{'average_memory'} / $return->{'cluster'}->{'average_memory'}
) ** 2;
# we'll divide by number of active nodes after the loop
$return->{'cluster'}->{'variance'} += $return->{'node'}->{$node}->{'deviation'};
}
$return->{'cluster'}->{'variance'} /= $return->{'cluster'}->{'count'};
# now, determine how much memory needs to be added (plus) or removed (minus) for each node
# memory_needed is calculated by taking the total amount of memory and multiplying it by the cluster average memory
# then subtracting whatever is already used
foreach my $node (sort keys %{ $main::statusDB->{'node'} } ) {
if ( $main::statusDB->{'node'}->{$node}->{'maintenance'} ) {
$return->{'node'}->{$node}->{'memory_needed'} = -1 * $return->{'node'}->{$node}->{'used_memory'};
} else {
$return->{'node'}->{$node}->{'memory_needed'} = int (
( $return->{'node'}->{$node}->{'memory'} * $return->{'cluster'}->{'average_memory'} ) -
$return->{'node'}->{$node}->{'used_memory'}
);
}
}
return $return;
}
sub humanReadable {
my ( $value, $preferredUnits ) = @_;
$value *= 1024;
my @units = ( '', 'k', 'M', 'G', 'T' );
$preferredUnits = $units[-1] unless $preferredUnits;
my $unit = 0;
while ( $unit < @units && abs($value) > 1023 && lc $units[$unit] ne lc $preferredUnits ) {
$unit++;
$value /= 1024;
}
return sprintf( '%d%s', $value+0.5, $units[$unit] );
}
sub percent {
my ($value, $accuracy) = @_;
$accuracy = 0 unless $accuracy;
return sprintf( '%2.' . $accuracy . 'f%%', $value * 100)
}
# Creates a balance report to show the user what went on
# $cluster is a hash created by sub getClusterStats, and possibly modified by
# the calling process
sub showBalanceReport {
my $stats = shift;
#die Dumper( $stats ) . "\n";
my @header = ('Node','Threads','Memory','Domains','vcpu_alloc','mem_alloc', 'mem_needed', 'vcpu%', 'mem%', 'Status', 'StdDev' );
my @data;
foreach my $node ( sort keys %{ $stats->{'node'} } ) {
push @data, [
$node,
$stats->{'node'}->{$node}->{'vcpu'},
&humanReadable( $stats->{'node'}->{$node}->{'memory'} ),
$stats->{'node'}->{$node}->{'count'},
$stats->{'node'}->{$node}->{'used_vcpu'},
&humanReadable( $stats->{'node'}->{$node}->{'used_memory'} ),
&humanReadable( $stats->{'node'}->{$node}->{'memory_needed'} ),
&percent( $stats->{'node'}->{$node}->{'used_vcpu'} / $stats->{'node'}->{$node}->{'vcpu'} ),
&percent( $stats->{'node'}->{$node}->{'used_memory'} / $stats->{'node'}->{$node}->{'memory'} ),
$stats->{'node'}->{$node}->{'maintenance'} ? 'Maintenance' : '',
$stats->{'node'}->{$node}->{'deviation'} < 1000 ? sprintf( "%2.2f", $stats->{'node'}->{$node}->{'deviation'} ) : 'undef'
];
}
push @data, [
'All',
$stats->{'cluster'}->{'vcpu'},
&humanReadable( $stats->{'cluster'}->{'memory'} ),
$stats->{'cluster'}->{'domain_count'},
$stats->{'cluster'}->{'used_vcpu'},
&humanReadable( $stats->{'cluster'}->{'used_memory'} ),
'',
&percent( $stats->{'cluster'}->{'used_vcpu'} / $stats->{'cluster'}->{'vcpu'} ),
&percent( $stats->{'cluster'}->{'used_memory'} / $stats->{'cluster'}->{'memory'} ),
'',
''
];
return &main::report( \@header, \@data ) . "Variance " .
( $stats->{'cluster'}->{'variance'} < 100 ? sprintf( "%2.2f", $stats->{'cluster'}->{'variance'} + .005 ) : "undef" ) . "\n\n";
}
# simulates performing migrations. Simply moves entries from $from to $to in $main::statusDB->{'nodePopulation'}
sub doActions {
my $actions = shift;
my $return = '';
for ( my $i = 0; $i < @$actions; $i++ ) {
my ($domain, $source, $target, $size ) = split( "\t", $actions->[$i] );
$return .= &main::migrate( $domain, $target, $source );
delete $main::statusDB->{'nodePopulation'}->{$source}->{'running'}->{$domain};
$main::statusDB->{'nodePopulation'}->{$target}->{'running'}->{$domain} = time;
}
&main::forceScan() unless $main::config->{'flags'}->{'dryrun'} || $main::config->{'flags'}->{'testing'};
return $return;
}
# attempt to balance the domains on the active (maintenance = false) nodes
# basically, we take what is currently working, and calculate the variance
# of it (see https://en.wikipedia.org/wiki/Standard_deviation). If that is
# over about a 10, we move things around, if possible, then check our variance
# again.
sub balance {
&main::readDB();
my $return = '';
# get the current cluster status
my $cluster = &getClusterStats();
#die Dumper( $cluster ) . "\n";
# show user what it looks like at first
print "=== Starting Status ===\n\n" . &showBalanceReport( $cluster) unless $main::config->{'flags'}->{'quiet'};
# we will do a loop to get the variance within our preferred range ($main::config->{ 'balance variance'})
# however, we will only do a maximum number of iterations ($main::config->{ 'balance maxiterations'})
my $iterations = defined $main::config->{ 'balance_max_iterations'} && $main::config->{ 'balance_max_iterations'} ? $main::config->{ 'balance_max_iterations'} : 10;
$main::config->{ 'balance_max_variance'} = 1.1 unless defined $main::config->{ 'balance_max_variance'};
# continue until our variance is where we want it, or we have tried too many times.
while ( $iterations-- && $cluster->{'cluster'}->{'variance'} > $main::config->{ 'balance_max_variance'} ) {
my $actions = &moveThings( $cluster );
if ( my $output = &doActions( $actions ) ) {
$return .= $output;
} else {
last;
}
#print Dumper( $actions ) . "\n"; die;
# rerun stats
$cluster = &getClusterStats();
print &showBalanceReport( $cluster) if $main::config->{'flags'}->{'verbose'} > 1;
}
print "=== Ending Status ===\n\n" . &showBalanceReport( $cluster) unless $main::config->{'flags'}->{'quiet'};
return $return ? $return : "Already Balanced: No actions to take\n";
} # balance
# finds node which needs to lose ($from) and gain ($to) the most. Then, goes through $from and finds the largest
# domain which will fit on $to until exhausted.
# as each domain is found, appends to $actions (array pointer). The format of each entry is a tab separated
# list of domain name, node from, node to, domain size
# returns the modified $actions
sub moveThings {
my $stats = shift;
my $actions = [];
# find largest and smallest node differences
my $transfer;
my $from = '';
my $to = '';
# find smallest and largest "memory needed" in group. Note that if a node has too much, the number is negative and
# for too little (ie, needs additional), the number is positive
foreach my $node (keys %{$stats->{'node'} } ) {
#print "Checking $node\n";
if ( $from ) {
$from = $node if $stats->{'node'}->{$from}->{'memory_needed'} > $stats->{'node'}->{$node}->{'memory_needed'};
$to = $node if $stats->{'node'}->{$to}->{'memory_needed'} < $stats->{'node'}->{$node}->{'memory_needed'};
} else { # just initialize everything to this node
$from = $to = $node;
} #if .. else
} # foreach
# this is a poor mans min. we want to transfer the least number of bytes, ie what $from can spare, or what $to can accept
# we need the smallest of what $from can give and $to can accept
$transfer = abs( abs( $stats->{'node'}->{$from}->{'memory_needed'} ) > abs( $stats->{'node'}->{$to}->{'memory_needed'} ) ?
$stats->{'node'}->{$to}->{'memory_needed'} : $stats->{'node'}->{$from}->{'memory_needed'} );
# die "Transfer " . &humanReadable($transfer) ." bytes from $from to $to\n";
# get array of domains running on $from, sorted by the size of the domain (descending, ie largest on top )
# basically, get all keys from $main::statusDB->{'nodePopulation'}->{$from}->{'running'}, then sort them by looking them
# up in $main::statusDB->{'virt'} and retrieving the amount of RAM
my @sortedDomains = sort
{
$main::statusDB->{'virt'}->{$b}->{'memory'} <=> $main::statusDB->{'virt'}->{$a}->{'memory'}
} keys %{ $main::statusDB->{'nodePopulation'}->{$from}->{'running'} };
# now, "move" (fake move) largest domain that will fit into $to, and repeat until we can not do it anymore
while ( $transfer ) {
my $thisDomain = shift @sortedDomains;
last unless $thisDomain; # we ran out of domains
next unless $main::statusDB->{'virt'}->{$thisDomain}->{'memory'} <= $transfer;
push @$actions, join( "\t", ( $thisDomain, $from, $to, $main::statusDB->{'virt'}->{$thisDomain}->{'memory'} ) );
$transfer -= $main::statusDB->{'virt'}->{$thisDomain}->{'memory'};
}
return $actions;
}