1 #!/bin/ksh
   2 #
   3 # CDDL HEADER START
   4 #
   5 # The contents of this file are subject to the terms of the
   6 # Common Development and Distribution License (the License).
   7 # You may not use this file except in compliance with the License.
   8 #
   9 # You can obtain a copy of the license at usr/src/CDDL.txt
  10 # or http://www.opensolaris.org/os/licensing.
  11 # See the License for the specific language governing permissions
  12 # and limitations under the License.
  13 #
  14 # When distributing Covered Code, include this CDDL HEADER in each
  15 # file and include the License file at usr/src/CDDL.txt.
  16 # If applicable, add the following below this CDDL HEADER, with the
  17 # fields enclosed by brackets [] replaced with your own identifying
  18 # information: Portions Copyright [yyyy] [name of copyright owner]
  19 #
  20 # CDDL HEADER END
  21 #
  22 # Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23 # Use is subject to license terms.
  24 #
  25 # ident "%Z%%M% %I%     %E% SMI"
  26 #
  27 
  28 PKG=SUNWscxvm
  29 TASK_COMMAND=""
  30 RESOURCE_PROJECT_NAME=""
  31 CCR_TABLE=${VM}_"domain_config"
  32 TMP_DIR="/var/tmp"
  33 LOGFILE=${TMP_DIR}/${RESOURCE}_logfile
  34 
  35 # Commands definition
  36 SCLOGGER=/usr/cluster/lib/sc/scds_syslog
  37 LOGGER=/usr/bin/logger
  38 GREP=/usr/xpg4/bin/grep
  39 AWK=/usr/bin/awk
  40 PGREP=/usr/bin/pgrep
  41 SLEEP=/usr/bin/sleep
  42 TR=/usr/xpg4/bin/tr
  43 SCHA_RESOURCE_GET=/usr/cluster/bin/scha_resource_get
  44 SCHA_RESOURCEGROUP_GET=/usr/cluster/bin/scha_resourcegroup_get
  45 SCHA_CLUSTER_GET=/usr/cluster/bin/scha_cluster_get
  46 HATIMERUN=/usr/cluster/bin/hatimerun
  47 LDM=/opt/SUNWldm/bin/ldm
  48 VIRSH=/usr/bin/virsh
  49 XM=/usr/sbin/xm
  50 CCRADM=/usr/cluster/lib/sc/ccradm
  51 CL_EXEC_CLIENT=/usr/cluster/lib/sc/cl_exec_client
  52 
  53 syslog_tag()
  54 {
  55         ${SET_DEBUG}
  56         print "SC[${PKG:-??}.${METHOD:-??}]:${RESOURCEGROUP:-??}:${RESOURCE:-??}"
  57 }
  58 
  59 scds_syslog()
  60 {
  61         if [ -f "${SCLOGGER}" ]
  62         then
  63            ${SCLOGGER} "$@" &
  64         else
  65            while getopts 'p:t:m' opt
  66            do
  67               case "${opt}" in
  68                  t) TAG=${OPTARG};;
  69                  p) PRI=${OPTARG};;
  70               esac
  71            done
  72       
  73            shift $((${OPTIND} - 1))
  74            LOG_STRING=$(/usr/bin/printf "$@")
  75            ${LOGGER} -p ${PRI} -t ${TAG} ${LOG_STRING}
  76         fi
  77 }
  78 
  79 i18n_message()
  80 {
  81         debug_message "Function: i18n_message - Begin"
  82         ${DEBUG}
  83 
  84         print -u2 $(/bin/printf "$@")
  85 
  86         debug_message "Function: i18n_message - End"
  87         return 0
  88 }
  89 
  90 debug_message()
  91 {
  92         typeset DEBUG_TEXT=
  93    
  94         case ${DEBUG_LEVEL} in
  95            0)   # No debug msgs
  96               SET_DEBUG=
  97               ;;
  98            1)  # Begin and End msgs
  99               SET_DEBUG=
 100               DEBUG_TEXT=$(echo ${1} | ${GREP} -E 'Begin|End')
 101               ;;
 102            2)  # All debug msgs
 103               SET_DEBUG="set -x"
 104               DEBUG_TEXT=${1}
 105               ;;
 106         esac
 107    
 108         [[ -n "${DEBUG_TEXT}" ]] && \
 109              scds_syslog -p daemon.debug -t $(syslog_tag) -m \
 110                   "%s" "${DEBUG_TEXT}"
 111 }
 112 
 113 log_message()
 114 {
 115         #
 116         # Output a message to syslog as required
 117         #
 118    
 119         debug_message "Function: log_message - Begin"
 120    
 121         ${SET_DEBUG}
 122    
 123         if [ -s "${LOGFILE}" ]
 124         then
 125            PRIORITY=${1}
 126            HEADER=${2}
 127       
 128            #
 129            # Ensure that the while loop only reads a closed file
 130            #
 131            strings ${LOGFILE} > ${LOGFILE}.copy
 132            while read MSG_TXT
 133            do
 134               scds_syslog -p daemon.${PRIORITY} -t $(syslog_tag) -m \
 135                    "%s - %s" "${HEADER}" "${MSG_TXT}"
 136            done < ${LOGFILE}.copy
 137         fi
 138    
 139         cat /dev/null > ${LOGFILE} > /dev/null
 140         cat /dev/null > ${LOGFILE}.copy
 141    
 142         debug_message "Function: log_message - End"
 143 }
 144 
 145 
 146 get_resource_property()
 147 {
 148         debug_message "Function: get_resource_property - Begin"
 149         ${SET_DEBUG}
 150    
 151         typeset RS=${1}
 152         typeset PROPERTY=${2}
 153         typeset rc
 154    
 155         # Retrieve the property value.
 156         OUTPUT=$(${SCHA_RESOURCE_GET} -O Extension -R ${RS} ${PROPERTY})
 157         rc=${?}
 158    
 159         debug_message "get_resource_property - " \
 160              "scha_resource_get of property ${PROPERTY} returned ${rc}"
 161    
 162         if (( ${rc} == 0 ))
 163         then
 164            # print the values
 165            echo ${OUTPUT} | ${AWK} '{ \
 166                 if (NF > 1) for (i = 2; i <= NF; i++) print $i; else print "" }'
 167         fi
 168    
 169         debug_message "Function: get_resource_property - End"
 170    
 171         return ${rc}
 172 }
 173 
 174 
 175 get_properties()
 176 {
 177         debug_message "Function: get_properties - Begin"
 178         ${SET_DEBUG}
 179    
 180         typeset -i rc
 181         typeset props=$*
 182    
 183         for prop in ${props}
 184         do
 185            # retrieve the property value
 186            typeset val=$(get_resource_property ${RESOURCE} ${prop})
 187            rc=${?}
 188       
 189            if (( ${rc} == 0 ))
 190            then
 191               case ${prop} in
 192                  Domain_name)    [[ -z ${DOMAIN} ]] && DOMAIN=${val};;
 193                  Migration_type) [[ -z ${MIGRATION_TYPE} ]] && MIGRATION_TYPE=${val};;
 194                  Plugin_probe)   [[ -z ${PLUGIN_PROBE} ]] && PLUGIN_PROBE=${val};;
 195                  Password_file)  [[ -z ${PASSWORD_FILE} ]] && PASSWORD_FILE=${val};;
 196                  Debug_level)    [[ -z ${DEBUG_LEVEL} ]] && DEBUG_LEVEL=${val};;
 197               esac
 198            else
 199               # SCMSGS
 200               # @explanation
 201               # The scha_resource_get call failed.
 202               # @user_action
 203               # Check the syslog for further messages.
 204               scds_syslog -p daemon.error -t $(syslog_tag) -m \
 205                    "Cannot get the property %s of resource %s." \
 206                    "${prop}" "${RESOURCE}"
 207               break
 208            fi
 209         done
 210    
 211         debug_message "Function: get_properties - End"
 212    
 213         return ${rc}
 214 }
 215 
 216 validate_xvm()
 217 {
 218         debug_message "Function: validate_xvm - Begin"
 219         ${SET_DEBUG}
 220 
 221         typeset rc=0
 222         typeset msgtext
 223    
 224         if [ "$(/usr/bin/uname -i)" != "i86xpv" ]
 225         then
 226            # SCMSGS
 227            # @explanation
 228            # Solaris is not booted with xVM.
 229            # @user_action
 230            # Ensure that the default boot grub menu is set to boot
 231            # Solaris xVM.
 232            scds_syslog -p daemon.error -t $(syslog_tag) -m \
 233                 "Node is not booted with xVM."
 234 
 235            msgtext=$(gettext "Node is not booted with xVM.")
 236            i18n_message "${msgtext}" 
 237 
 238            rc=1
 239         fi
 240 
 241         debug_message "Function: validate_xvm - End"
 242 
 243         return ${rc}
 244 }
 245 
 246 validate_ldom()
 247 {
 248         debug_message "Function: validate_ldom - Begin"
 249         ${SET_DEBUG}
 250 
 251         typeset ncount=0
 252         typeset msgtext
 253 
 254         # Make sure that the password file is readable.
 255         if [[ ${MIGRATION_TYPE} != "NORMAL" ]]
 256         then
 257            if [ -z "${PASSWORD_FILE}" ]
 258            then
 259               # SCMSGS
 260               # @explanation
 261               # Password file cannot be null.
 262               # @user_action
 263               # Ensure that a password file name is specified.
 264               scds_syslog -p daemon.error -t $(syslog_tag) -m \
 265                  "Password file cannot be (null)."
 266 
 267               msgtext=$(gettext "Password file cannot be (null).")
 268               i18n_message "${msgtext}"
 269 
 270               debug_message "Function: validate_ldom - End"
 271               return 1
 272            fi
 273 
 274            if [[ ! -f "${PASSWORD_FILE}" ]] || [[ ! -r "${PASSWORD_FILE}" ]]
 275            then
 276               # SCMSGS
 277               # @explanation
 278               # Incorrect Password file specified.
 279               # @user_action
 280               # Ensure that a valid password file is specified.
 281               scds_syslog -p daemon.error -t $(syslog_tag) -m \
 282                  "Invalid password file specified %s." \
 283                  "${PASSWORD_FILE}"
 284 
 285               msgtext=$(gettext "Invalid password file specified %s.")
 286               i18n_message "${msgtext}" "${PASSWORD_FILE}"
 287 
 288               debug_message "Function: validate_ldom - End"
 289               return 1
 290            fi
 291         fi
 292         
 293         # Ensure that the control domain is a cluster node.
 294         if ! ${LDM} ls > /dev/null 2>&1
 295         then
 296            # SCMSGS
 297            # @explanation
 298            # Self explanatory.
 299            # @user_action
 300            # Ensure that the resource is configured in
 301            # control domain.
 302            scds_syslog -p daemon.error -t $(syslog_tag) -m \
 303                "The LDom Manager is running in configuration mode."
 304 
 305            msgtext=$(gettext "The LDom Manager is running in configuration mode.")
 306            i18n_message "${msgtext}"
 307 
 308            debug_message "Function: validate_ldom - End"
 309            return 1
 310         fi
 311 
 312         # Ensure that the failure-policy setting is set to "reset".
 313         # If the control domain fails,this would allow the guest domains
 314         # to panic. 
 315         policy=$(${LDM} list -o domain primary \
 316             | ${AWK} -F"=" '$1~/failure-policy/ {print $2}')
 317 
 318         if [ "${policy}" != "reset" ]
 319         then
 320            # SCMSGS
 321            # @explanation
 322            # Incorrect failure-policy setting for the domain.
 323            # @user_action
 324            # Ensure that the failure-policy for the domain is
 325            # set to "reset" on the control domain.
 326            scds_syslog -p daemon.error -t $(syslog_tag) -m \
 327               "Invalid failure policy \"%s\" for %s domain." \
 328               "${policy}" "primary"
 329 
 330            msgtext=$(gettext "Invalid failure policy \"%s\" for %s domain.")
 331            i18n_message "${msgtext}" "${policy}" "primary"
 332 
 333            debug_message "Function: validate_ldom - End"
 334            return 1
 335         fi
 336 
 337         # The CL_EXEC_CLIENT program executes a command on any of the 
 338         # cluster nodes or a zone or in a zone cluster. It then generates
 339         # as output the exit status of command and the stdout and stderr
 340         # messages. The valid options are:
 341         #     [ -z zoneclustername] The command is run on the zone cluster
 342         # represented by the zonename.
 343         #     -C { TS | RT | FSS | FX } The scheduling class in which the
 344         #  command is to be run.
 345         #     -p pri Specifies the priority of the command in the given
 346         # scheduling class.
 347         #     -n id[,id..] A comma seperated list of node ID's of a
 348         # zone cluster or a node to run the command.
 349         #     -c cmd [Args] The command to be run along with its arguments.
 350 
 351         for nodename in $(${SCHA_RESOURCEGROUP_GET} -O NODELIST -G ${RESOURCEGROUP})
 352         do
 353            if [[ "$(${SCHA_CLUSTER_GET} -O NodeState_Node ${nodename})" == "DOWN" ]]
 354            then
 355               continue
 356            fi
 357 
 358            nodeid=$(${SCHA_CLUSTER_GET} -O NODEID_NODENAME ${nodename})
 359            output=$(${CL_EXEC_CLIENT} -n ${nodeid} -c "${LDM} list-domain ${DOMAIN}")
 360            result=${?}
 361            status=$(echo ${output} | ${AWK} '{print $6}')
 362 
 363            if (( ${result} == 0 )) && (( ${status} == 0 ))
 364            then
 365               domstate=$(echo $output | ${AWK} -F" " '{print $18}')     
 366 
 367               if (( ${update} == 0)) && echo $domstate | ${GREP} -q -E "^active$|suspending|resuming|suspended|starting" > /dev/null 2>&1
 368               then
 369                  # SCMSGS
 370                  # @explanation
 371                  # The domain is in an invalid state.
 372                  # @user_action
 373                  # Ensure that the domain is in inactive or bound state.
 374                  scds_syslog -p daemon.error -t $(syslog_tag) -m \
 375                     "Domain %s is in %s state on %s." \
 376                     "${DOMAIN}" "${domstate}" "${nodename}"
 377 
 378                    msgtext=$(gettext "Domain %s is in %s state on %s.")
 379                    i18n_message "${msgtext}" "${DOMAIN}" "${domstate}" "${nodename}"
 380 
 381                  debug_message "Function: validate_ldom - End"
 382                  return 1
 383               fi
 384 
 385               ncount=$((ncount+1))
 386               nlist=$(echo ${nodename} ${nlist})
 387 
 388               # dump domain confguration to ccr
 389               if [[ "$(/usr/bin/hostname)" == "${nodename}" ]]
 390               then
 391                  if ! dump_domain_config
 392                  then
 393                     debug_message "Function: validate_ldom - End"
 394                     return 1
 395                  fi
 396               fi
 397             fi
 398         done
 399 
 400         if (( ${ncount} == 0 ))
 401         then
 402            if ! ${CCRADM} showkey --key xml_${RESOURCE} ${CCR_TABLE} > /dev/null 2>&1
 403            then
 404               scds_syslog -p daemon.error -t $(syslog_tag) -m \
 405                  "Domain %s does not exist." \
 406                  "${DOMAIN}"
 407 
 408               msgtext=$(gettext "Domain %s does not exist.")
 409               i18n_message "${msgtext}" "${DOMAIN}"
 410 
 411               return 1
 412            fi
 413         fi
 414 
 415         if [[ ${ncount} -gt 1 ]]
 416         then
 417            # SCMSGS
 418            # @explanation
 419            # The domain is configured on multiple 
 420            # cluster nodes.
 421            # @user_action
 422            # Ensure that the domain is configured on one node
 423            # of the cluster.
 424            scds_syslog -p daemon.error -t $(syslog_tag) -m \
 425               "Multiple domain %s configuration exists on %s." \
 426               "${DOMAIN}" "${nlist}"
 427 
 428               msgtext=$(gettext "Multiple domain %s configuration exists on %s.")
 429               i18n_message "${msgtext}" "${DOMAIN}" "${nlist}"
 430 
 431            return 1
 432         fi
 433 
 434         debug_message "Function: validate_ldom - End"
 435         return 0
 436 }
 437 
 438 validate()
 439 {
 440         debug_message "Function: validate - Begin"
 441         ${SET_DEBUG}
 442 
 443         typeset rc
 444 
 445         # Make sure that the plugin probe specified is readable.
 446         if [[ -n "${PLUGIN_PROBE}" ]]
 447         then
 448            if [ -f "${PLUGIN_PROBE}" ] && [ ! -r "${PLUGIN_PROBE}" ]
 449            then
 450                # SCMSGS
 451                # @explanation
 452                # Incorrect user probe file specified.
 453                # @user_action
 454                # Ensure that a valid user probe file is specified.
 455                scds_syslog -p daemon.error -t $(syslog_tag) -m \
 456                    "Invalid user probe file %s." \
 457                    "${PLUGIN_PROBE}"
 458 
 459                msgtext=$(gettext "Invalid user probe file %s.")
 460                i18n_message "${msgtext}" "${PLUGIN_PROBE}"
 461 
 462                return 1
 463            fi
 464         fi
 465    
 466         validate_${VM}
 467         rc=${?}   
 468 
 469         debug_message "Function: validate - End"
 470         return ${rc}
 471 }
 472 
 473 #
 474 # get the domain status 
 475 #
 476 get_xvm_status()
 477 {
 478         debug_message "Function: get_xvm_status - Begin"
 479         ${SET_DEBUG}
 480 
 481         typeset rc
 482 
 483         ${VIRSH} domstate ${DOMAIN}
 484         rc=${?}   
 485 
 486         debug_message "Function: get_xvm_status - End"
 487         return ${rc}
 488 }
 489 
 490 get_ldom_status()
 491 {
 492         debug_message "Function: get_ldom_status - Begin"
 493         ${SET_DEBUG}
 494 
 495         typeset rc=1
 496 
 497         OUTPUT=$(${LDM} list-domain ${DOMAIN})
 498 
 499         if (( ${?} == 0 ))
 500         then
 501            echo ${OUTPUT} | ${AWK} '{print $10}'
 502            rc=${?}
 503         fi
 504    
 505         debug_message "Function: get_ldom_status - End"
 506         return ${rc}
 507 }
 508 
 509 #
 510 # Routines to create the domain on the current cluster node.
 511 #
 512 add_xvm_domain()
 513 {
 514         debug_message "Function: add_xvm_domain - Begin"
 515         ${SET_DEBUG}
 516 
 517         typeset rc=0
 518    
 519         if ! ${VIRSH} define ${TMP_DIR}/${RESOURCE}.xml >> $LOGFILE 2>&1
 520         then
 521            # SCMSGS
 522            # @explanation
 523            # Defining the domain using an XML file failed.
 524            # @user_action
 525            # The command /usr/bin/virsh define failed to define the domain.
 526            # Determine if you have specified the correct domain name while
 527            # registering the resource.
 528            scds_syslog -p daemon.error -t $(syslog_tag) -m \
 529                 "Failed to define %s using %s/%s.xml." \
 530                 "${DOMAIN}" "${TMP_DIR}" "${RESOURCE}"
 531            rc=1
 532         fi
 533    
 534         debug_message "Function: add_xvm_domain - End"
 535         return ${rc}
 536 }
 537 
 538 add_ldom_domain()
 539 {
 540         debug_message "Function: add_ldom_domain - Begin"
 541         ${SET_DEBUG}
 542 
 543         typeset rc=0
 544    
 545         if ! ${LDM} add-domain -i ${TMP_DIR}/${RESOURCE}.xml ${DOMAIN} >> $LOGFILE 2>&1
 546         then
 547            # SCMSGS
 548            # @explanation
 549            # Defining the domain using an XML file failed.
 550            # @user_action
 551            # The command /opt/SUNWldm/bin/ldm "add-domain"
 552            # failed to define the domain. Determine if you
 553            # have specified the correct domain name when
 554            # registering the resource.
 555            scds_syslog -p daemon.error -t $(syslog_tag) -m \
 556                 "Failed to add the domain %s using %s/%s.xml." \
 557                 "${DOMAIN}" "${TMP_DIR}" "${RESOURCE}"
 558            rc=1
 559         fi
 560    
 561         debug_message "Function: add_ldom_domain - End"
 562         return ${rc}
 563 }
 564 
 565 #
 566 # test if domain is active
 567 #
 568 is_xvm_up()
 569 {
 570         debug_message "Function: is_xvm_up - Begin"
 571         ${SET_DEBUG}
 572 
 573         typeset rc=0
 574 
 575         echo $(${VIRSH} domstate ${DOMAIN}) | \
 576              ${GREP} -q -E "running|blocked|paused|in shutdown" > /dev/null 2>&1
 577         rc=${?}
 578    
 579         debug_message "Function: is_xvm_up - End"
 580         return ${rc}
 581 }
 582 
 583 is_ldom_up()
 584 {
 585         debug_message "Function: is_ldom_up - Begin"
 586         ${SET_DEBUG}
 587 
 588         typeset rc=0
 589 
 590         get_ldom_status | ${GREP} -q -E "^active$|^starting$" > /dev/null 2>&1
 591         rc=${?}
 592    
 593         debug_message "Function: is_ldom_up - End"
 594         return ${rc}
 595 }
 596 
 597 #
 598 # wrapper routines to start xvm or ldom domains
 599 #
 600 start_xvm()
 601 {
 602         debug_message "Function: start_xvm - Begin"
 603         ${SET_DEBUG}
 604 
 605         typeset rc=0
 606 
 607         ${VIRSH} start ${DOMAIN} >> $LOGFILE 2>&1
 608         rc=${?}
 609    
 610         debug_message "Function: start_xvm - End"
 611         return ${rc}
 612 }
 613 
 614 #
 615 # After a crash/reboot of the node, the domain
 616 # would be started and there would be multiple
 617 # instances of the same domain across cluster
 618 # nodes. Hence the domain is destroyed.
 619 #
 620 init_ldom()
 621 {
 622         debug_message "Function: init_ldom - Begin"
 623         ${SET_DEBUG}
 624 
 625         typeset rc
 626 
 627         MAX_STOP_TIMEOUT=$(${SCHA_RESOURCE_GET} -O INIT_TIMEOUT \
 628            -R ${RESOURCE} -G ${RESOURCEGROUP} )
 629 
 630         domain_shutdown
 631         rc=${?}
 632 
 633         debug_message "Function: init_ldom - End"
 634         return ${rc}
 635 }
 636 
 637 start_ldom()
 638 {
 639         debug_message "Function: start_ldom - Begin"
 640         ${SET_DEBUG}
 641 
 642         typeset rc=0
 643 
 644         if get_${VM}_status | ${GREP} -q -E "^inactive$" > /dev/null 2>&1
 645         then
 646            if ${LDM} bind-domain ${DOMAIN} >> $LOGFILE 2>&1
 647            then
 648               # SCMSGS
 649               # @explanation
 650               # The domain was bound.
 651               # @user_action
 652               # None required. The domain has been bound on this node.
 653               scds_syslog -p daemon.notice -t $(syslog_tag) -m \
 654                    "Domain %s is bound." \
 655                    "${DOMAIN}"
 656               rc=0
 657            else
 658               # SCMSGS
 659               # @explanation
 660               # The /opt/SUNWldm/bin/ldm bind-domain command failed.
 661               # @user_action
 662               # Determine why it was not possible to bind the domain.
 663               scds_syslog -p daemon.error -t $(syslog_tag) -m \
 664                    "Failed to bind %s." \
 665                    "${DOMAIN}"
 666               rc=1
 667            fi
 668       
 669         fi
 670         
 671         #
 672         # The domain is made to sit at the OBP prompt, so a reboot/crash
 673         # wouldn't boot the Guest domain OS.
 674         # 
 675         if (( ${rc} == 0 )) && ${LDM} set-var auto-boot?=true ${DOMAIN} >> $LOGFILE 2>&1
 676         then
 677            if ${LDM} start-domain ${DOMAIN} >> $LOGFILE 2>&1
 678            then
 679               while [ 1 ]
 680               do
 681                  flag=$(${LDM} list-domain -p ${DOMAIN} | ${GREP} ${DOMAIN} \
 682                     | ${AWK} -F"|" '{print $4}'| ${AWK} -F"=" '{print $2}')
 683                  [[ "${flag}" == "-n----" ]] && break
 684                  ${SLEEP} 1     
 685               done 
 686            else
 687               rc=1
 688            fi
 689            ${LDM} set-var auto-boot?=false ${DOMAIN} >> $LOGFILE 2>&1 || rc=1
 690         else
 691            rc=1
 692         fi
 693 
 694         debug_message "Function: start_ldom - End"
 695         return ${rc}
 696 }
 697 
 698 start_domain()
 699 {
 700         debug_message "Function: start_domain - Begin"
 701         ${SET_DEBUG}
 702    
 703         typeset rc=0
 704    
 705         # Turn off PMF restart. Starting a domain does not leave
 706         # a running pid as in a classic Solaris Cluster agent.
 707    
 708         START_TIMEOUT=$(${SCHA_RESOURCE_GET} -O START_TIMEOUT \
 709              -R ${RESOURCE} -G ${RESOURCEGROUP} )
 710    
 711         ${SLEEP} ${START_TIMEOUT} &
 712         /usr/cluster/bin/pmfadm -s ${RESOURCEGROUP},${RESOURCE},0.svc
 713    
 714         # Check if the domain exists.
 715         #
 716         # If the domain does not exist, we maybe starting the domain
 717         # on a new cluster node following a failover. As such we will
 718         # define the domain using the previously dumped XML file
 719         # located within the agent's administrative file system.
 720         #
 721         # If the domain already exists, either the domain was manually
 722         # started or the domain was migrated or live migrated from
 723         # another cluster node. Therefore, we will use the already
 724         # defined domain.
 725         #
 726         # Note that when the domain is successfully stopped the domain
 727         # is deleted. We do this simply to avoid the domain from
 728         # being manually started on multiple cluster nodes. See
 729         # domain_delete() for more information.
 730    
 731         if get_${VM}_status > /dev/null 2>&1
 732         then
 733            debug_message "Validate - domain ${DOMAIN} exists"
 734         else
 735            if ${CCRADM} showkey --key xml_${RESOURCE} ${CCR_TABLE} > ${TMP_DIR}/${RESOURCE}.xml 2> /dev/null
 736            then
 737               # add the domain to the cluster node
 738               if add_${VM}_domain ${DOMAIN} ${TMP_DIR}/${RESOURCE}.xml
 739               then
 740                  # SCMSGS
 741                  # @explanation
 742                  # The domain is being defined using a XML file.
 743                  # @user_action
 744                  # None, the domain is being defined using a previously defined
 745                  # XML file when the domain was last successfully started.
 746                  scds_syslog -p daemon.notice -t $(syslog_tag) -m \
 747                       "Domain %s defined using %s/%s.xml." \
 748                       "${DOMAIN}" "${TMP_DIR}" "${RESOURCE}"
 749               else
 750                  # error already logged.
 751                  debug_message "Function: start_domain - End"
 752                  return 1
 753               fi
 754            else
 755               # SCMSGS
 756               # @explanation
 757               # The domain does not exist.
 758               # @user_action
 759               # You must ensure that the domain exists.
 760               scds_syslog -p daemon.error -t $(syslog_tag) -m \
 761                    "Domain %s does not exist." \
 762                    "${DOMAIN}"
 763               
 764               debug_message "Function: start_domain - End"
 765               return 1
 766            fi
 767         fi
 768         
 769         # Tolerate a manually started domain and a NO-OP start
 770         # otherwise start the domain.
 771         
 772         if ${CCRADM} showkey --key noop_${RESOURCE} ${CCR_TABLE} > /dev/null 2>&1
 773         then
 774            # SCMSGS
 775            # @explanation
 776            # The domain was migrated or live migrated.
 777            # @user_action
 778            # None required. Informational message.
 779            scds_syslog -p daemon.notice -t $(syslog_tag) -m \
 780                 "NO-OP START being performed."
 781            
 782            if ! ${CCRADM} delkey --key noop_${RESOURCE} ${CCR_TABLE} >> $LOGFILE 2>&1
 783            then
 784               # SCMSGS
 785               # @explanation
 786               # Failed to delete the NO-OP flag from CCR.
 787               # @user_action
 788               # Check the syslog for further messages.
 789               # Determine why the NO-OP flag was not added to the CCR.
 790               scds_syslog -p daemon.error -t $(syslog_tag) -m \
 791                    "Failed to delete NO-OP flag for %s domain." \
 792                    "${DOMAIN}"
 793               
 794               debug_message "Function: start_domain - End"
 795               return 1
 796            else
 797               debug_message "start_domain - noop_${RESOURCE} deleted"
 798            fi
 799            
 800         elif is_${VM}_up
 801         then
 802            # SCMSGS
 803            # @explanation
 804            # The domain was manually started.
 805            # @user_action
 806            # None required. Informational message.
 807            scds_syslog -p daemon.notice -t $(syslog_tag) -m \
 808                 "Domain %s was manually started." \
 809                 "${DOMAIN}"
 810         else
 811            if start_${VM}
 812            then
 813               # SCMSGS
 814               # @explanation
 815               # The domain was started successfully.
 816               # @user_action
 817               # None required. Informational message.
 818               scds_syslog -p daemon.notice -t $(syslog_tag) -m \
 819                    "Domain %s started." \
 820                    "${DOMAIN}"
 821            else
 822               # SCMSGS
 823               # @explanation
 824               # The domain failed to start.
 825               # @user_action
 826               # Check the syslog for further messages. If possible
 827               # the cluster will attempt to restart the domain.
 828               scds_syslog -p daemon.error -t $(syslog_tag) -m \
 829                    "Domain %s failed to start." \
 830                    "${DOMAIN}"
 831                    
 832               rc=1
 833            fi
 834         fi
 835         
 836         if (( ${rc} == 0 ))
 837         then
 838            # Dump the domain configuration into an XML file. This file is then
 839            # used on another cluster node to define the domain but only if the
 840            # domain does not exist.
 841            
 842            dump_domain_config
 843            rc=${?}
 844         fi
 845         
 846         debug_message "Function: start_domain - End"
 847         return ${rc}
 848 }
 849 
 850 #
 851 # dump the domain configuration
 852 #
 853 dump_xvm_xml()
 854 {
 855         debug_message "Function: dump_xvm_xml - Begin"
 856         ${SET_DEBUG}
 857         
 858         typeset rc=0
 859 
 860         if ! ${VIRSH} dumpxml ${DOMAIN} 2>> $LOGFILE
 861         then
 862            # SCMSGS
 863            # @explanation
 864            # "/usr/bin/virsh dumpxml" for domain failed.
 865            # @user_action
 866            # Determine why the command to dump domain
 867            # configuration failed.
 868            scds_syslog -p daemon.error -t $(syslog_tag) -m \
 869                "%s dumpxml for domain %s failed." \
 870                "${VIRSH}" "${DOMAIN}"
 871             rc=${?}
 872         fi
 873 
 874         debug_message "Function: dump_xvm_xml - End"
 875         return ${rc}
 876 }
 877 
 878 dump_ldom_xml()
 879 {
 880         debug_message "Function: dump_ldom_xml - Begin"
 881         ${SET_DEBUG}
 882         
 883         typeset rc=0
 884 
 885         if ! ${LDM} list-constraints -x ${DOMAIN} 2>> $LOGFILE
 886         then
 887            # SCMSGS
 888            # @explanation
 889            # "/opt/SUNWldm/bin/ldm list-constraints -x"
 890            # for domain failed.
 891            # @user_action
 892            # Determine why the command to list the
 893            # domain constraints failed.
 894            scds_syslog -p daemon.error -t $(syslog_tag) -m \
 895                "%s list-constraints for domain %s failed." \
 896                "${LDM}" "${DOMAIN}"
 897             rc=1
 898         fi
 899    
 900         debug_message "Function: dump_ldom_xml - End"
 901         return ${rc}
 902 }
 903 
 904 #
 905 # save the domain configuration in the cluster
 906 # configuration repository
 907 #
 908 dump_domain_config()
 909 {
 910         debug_message "Function: dump_domain_config - Begin"
 911         ${SET_DEBUG}
 912         
 913         typeset rc=0
 914         
 915         # Dump the domain configuration into an XML file. The domain configuration
 916         # can be changed, when under the the agent control.
 917         
 918         olddesc=$(${CCRADM} showkey --key xml_${RESOURCE} ${CCR_TABLE} 2> /dev/null)
 919         
 920         if (( ${?} == 1 ))
 921         then
 922            #
 923            # The ccr table might not exist.
 924            # create the CCR table, if it doesn't exist.
 925            #
 926            if ${CCRADM} addtab ${CCR_TABLE} >> $LOGFILE 2>&1
 927            then
 928               debug_message "created ccr table ${CCR_TABLE}"
 929            else
 930               # SCMSGS
 931               # @explanation
 932               # Failed to create the CCR table.
 933               # @user_action
 934               # Check the syslog for further messages.
 935               # Determine why the CCR create failed.
 936               scds_syslog -p daemon.error -t $(syslog_tag) -m \
 937                    "Failed to create CCR table %s." \
 938                    "${CCR_TABLE}"
 939               
 940               return 1
 941            fi
 942         fi
 943         
 944         output=$(dump_${VM}_xml)
 945         if (( ${?} == 0 )) && [[ -n "${output}" ]]
 946         then
 947            newdesc=$(echo ${output} | ${TR} -s '\n' '[ ]')
 948            if [ "${olddesc}" != "${newdesc}" ]
 949            then
 950               if ! ${CCRADM} addkey --key=xml_${RESOURCE} --value "${newdesc}" ${CCR_TABLE} > /dev/null 2>&1
 951               then
 952                  if ! ${CCRADM} changekey --key=xml_${RESOURCE} --value "${newdesc}" ${CCR_TABLE} >> $LOGFILE 2>&1
 953                  then
 954                     # SCMSGS
 955                     # @explanation
 956                     # Failed to update the XMl dump to the CCR.
 957                     # @user_action
 958                     # Check the syslog for further messages.
 959                     # Determine why the ccr update failed.
 960                     scds_syslog -p daemon.error -t $(syslog_tag) -m \
 961                          "Failed to update domain XML %s to ccr." \
 962                          "${DOMAIN}"
 963                     
 964                     rc=1
 965                  fi
 966               else
 967                  debug_message "dump_domain_config - %s configuration added to CCR" "${DOMAIN}"
 968               fi
 969            fi
 970         else
 971            # error already logged.
 972            rc=1
 973         fi
 974         
 975         debug_message "Function: dump_domain_config - End"
 976         
 977         return ${rc}
 978 }
 979 
 980 #
 981 # probe function for domain data service
 982 #
 983 check_domain()
 984 {
 985         debug_message "Function: check_domain - Begin"
 986         ${SET_DEBUG}
 987         
 988         typeset rc
 989         SECONDS=0
 990         
 991         if ${PGREP} -f "control_xvm start -R ${RESOURCE} " >/dev/null 2>&1
 992         then
 993            debug_message "Function: check_domain - start program is still running "
 994            rc=100
 995         else
 996            domstate=$(get_${VM}_status 2>/dev/null)
 997            
 998            case "${domstate}" in
 999               
1000               # Acceptable run states
1001               "running"|"blocked"|"paused"|"in shutdown"| \
1002               "active"|"suspending"|"resuming"|"suspended"|"starting")
1003                     
1004                     if [ "${#PLUGIN_PROBE}" -ne 0 ]
1005                     then
1006                        if [ -x "$(echo ${PLUGIN_PROBE} | ${AWK} '{print $1}')" ]
1007                        then
1008                           PROBE_TIMEOUT=$(${SCHA_RESOURCE_GET} -O Extension -R ${RESOURCE} -G ${RESOURCEGROUP} Probe_timeout|tail -1)
1009                           # Run the supplied probe with only 90% of PROBE_TIMEOUT. Also note that this
1010                           # is supplied as a parameter to the PLUGIN_PROBE.
1011                           
1012                           HATIMERUN_TIMEOUT=$((PROBE_TIMEOUT*90/100-${SECONDS}))
1013                           
1014                           output=$(${HATIMERUN} -t ${HATIMERUN_TIMEOUT} -k 9 ${PLUGIN_PROBE} ${HATIMERUN_TIMEOUT})
1015                           rc=${?}
1016                           
1017                           case ${rc} in
1018                              0) debug_message "check_domain - ${DOMAIN} ${output}"
1019                                 rc=0
1020                              ;;
1021                              99)
1022                                 # SCMSGS
1023                                 # @explanation
1024                                 # The domain probe timed out.
1025                                 # @user_action
1026                                 # Ensure that ${PLUGIN_PROBE} can complete within
1027                                 # 90% of PROBE_TIMEOUT.
1028                                 scds_syslog -p daemon.error -t $(syslog_tag) -m \
1029                                      "%s did not complete within %s seconds." \
1030                                      "${PLUGIN_PROBE}" "${HATIMERUN_TIMEOUT}"
1031                                 
1032                                 rc=100
1033                              ;;
1034                              100) if ${PGREP} -f "gds_svc_start .*-R ${RESOURCE} " >/dev/null 2>&1
1035                                 then
1036                                    debug_message "check_domain - ${DOMAIN} is still starting"
1037                                    rc=100
1038                                 elif ${PGREP} -f "gds_svc_stop .*-R ${RESOURCE} " >/dev/null 2>&1
1039                                 then
1040                                    debug_message "check_domain - ${DOMAIN} is stopping"
1041                                    rc=100
1042                                 else
1043                                    # SCMSGS
1044                                    # @explanation
1045                                    # The domain probe has requested a domain restart.
1046                                    # @user_action
1047                                    # None. A domain restart will be attempted.
1048                                    scds_syslog -p daemon.error -t $(syslog_tag) -m \
1049                                         "% has requested a domain restart %s." \
1050                                         "${PLUGIN_PROBE}" "${output}"
1051                                    
1052                                    rc=100
1053                                 fi
1054                              ;;
1055                              201) if ${PGREP} -f "gds_svc_start .*-R ${RESOURCE} " >/dev/null 2>&1
1056                                 then
1057                                    debug_message "check_domain - ${DOMAIN} is still starting"
1058                                    rc=100
1059                                 elif ${PGREP} -f "gds_svc_stop .*-R ${RESOURCE} " >/dev/null 2>&1
1060                                 then
1061                                    debug_message "check_domain - ${DOMAIN} is stopping"
1062                                    rc=100
1063                                 else
1064                                    # SCMSGS
1065                                    # @explanation
1066                                    # The domain has requested an immediate failover.
1067                                    # @user_action
1068                                    # None. The domain will be immediately failed over.
1069                                    scds_syslog -p daemon.error -t $(syslog_tag) -m \
1070                                         "%s has requested an immediate failover." \
1071                                         "${PLUGIN_PROBE}"
1072                                    
1073                                    rc=201
1074                                 fi
1075                              ;;
1076                              *)
1077                                 # SCMSGS
1078                                 # @explanation
1079                                 # ${PLUGIN_PROBE} did not return 0, 100 or 201.
1080                                 # @user_action
1081                                 # None. A domain restart will be attempted.
1082                                 scds_syslog -p daemon.error -t $(syslog_tag) -m \
1083                                      "%s did not return 0, 100 or 201, a domain restart will be attempted." \
1084                                      "${PLUGIN_PROBE}"
1085                                 rc=100
1086                              ;;
1087                           esac
1088                        else
1089                           # SCMSGS
1090                           # @explanation
1091                           # ${PLUGIN_PROBE} does not exist or is not executable.
1092                           # @user_action
1093                           # Check the pathname exists and that ${PLUGIN_PROBE} is executable.
1094                           scds_syslog -p daemon.error -t $(syslog_tag) -m \
1095                                "%s non-existent executable." \
1096                                "${PLUGIN_PROBE}"
1097                           
1098                           rc=0
1099                        fi
1100                     else
1101                        rc=0
1102                     fi
1103                     
1104                  ;;
1105                  
1106                  # Restartable run states
1107                  
1108                  "shut off"|"crashed"| \
1109                  "inactive"|"stopping")
1110                        
1111                        rc=100
1112                     ;;
1113                     
1114                     # Unknown run states
1115                     
1116                     *)
1117                        rc=100
1118                     ;;
1119            esac
1120            
1121            debug_message "check_domain - ${DOMAIN} ${domstate}"
1122            
1123         fi
1124         
1125         debug_message "Function: check_domain - End"
1126         return ${rc}
1127 }
1128 
stop_domain()
{
        # Stop the domain, either by shutting it down or, when the
        # resource group is moving and Migration_type asks for it, by
        # migrating it to the target node.
        #
        # Globals read:    SCHA_RESOURCE_GET, RESOURCE, RESOURCEGROUP,
        #                  MIGRATION_TYPE
        # Globals written: STOP_TIMEOUT, MAX_MIGRATE_TIMEOUT,
        #                  MAX_STOP_TIMEOUT, SECONDS (reset to 0),
        #                  ON_OFF_SWITCH
        # Returns: 0 when the domain was shut down or migrated, non-zero
        #          when saving the configuration or the shutdown failed,
        #          or when Migration_type is invalid.
        debug_message "Function: stop_domain - Begin"
        ${SET_DEBUG}

        typeset rc=0

        STOP_TIMEOUT=$(${SCHA_RESOURCE_GET} -O STOP_TIMEOUT \
        -R ${RESOURCE} -G ${RESOURCEGROUP} )

        # Note that GDS will attempt to cleanup after 80% of STOP_TIMEOUT
        # has been consumed.  In this regard, we only allocate a combined
        # 75% of STOP_TIMEOUT to MAX_MIGRATE_TIMEOUT and MAX_STOP_TIMEOUT.
        #
        # This leaves 5% for domain_destroy() which may be called if
        # domain_shutdown() exceeds its timeout and finally domain_delete().

        MAX_MIGRATE_TIMEOUT=$((STOP_TIMEOUT*25/100))
        MAX_STOP_TIMEOUT=$((STOP_TIMEOUT*50/100))
        SECONDS=0

        # Save the domain configuration changes.
        if ! dump_domain_config
        then
           debug_message "Function: stop_domain - End"
           return 1
        fi

        # At resource creation, the administrator can determine the Migration_type.
        # Valid values for Migration_type are
        #
        # Migration_type="normal"
        #   o Stop the resource (shutdown the domain)
        #   o Failover the resource group from the source node to the target node
        #   o Start the resource (start the domain)
        #
        # Migration_type="migrate"
        #   o Suspend the domain on the source node
        #   o Copy the domain's memory pages from the source node to the target node
        #   o Resume the domain on the target node
        #
        # Migration_type="migrate_live"
        #   o Iteratively copy the domain's memory pages from the source node to the target node
        #   o When pre-copy is no longer beneficial, suspend the domain on the source node
        #   o Copy the domain's remaining "dirty" pages from the source node to the target node
        #   o Resume the domain on the target node
        #
        # Note that migration or live migration is performed over the cluster interconnect.
        #
        # For migration or live migration to be attempted across Solaris Cluster xVM nodes
        # the following conditions must be met.
        #
        # - The target Solaris Cluster xVM node must be running the same xVM version.
        #
        # - The migration TCP port must be open and accepting connections from the source
        #    Solaris Cluster xVM node.
        #
        # - There must be sufficient resources for the domain to run in.
        #
        # - If the conditions are met and migration or live migration is successful a NO-OP
        # STOP and START is performed. This will ensure a successful STOP and START to the
        # appropriate RGM callback methods. Furthermore, doing a NO-OP RGM failover will
        # ensure that RGM subsequently actions any dependencies and that Solaris Cluster
        # reflects the correct state and status of resource groups and resources.
        #
        # - If the conditions are met but migration or live migration is not successful a
        # normal failover will be performed.
        #
        # - If the conditions are not met, migration or live migration will fail and a normal
        # failover will be performed.
        #
        # However, before attempting a migration or live migration we need to determine if the
        # resource is being disabled. To distinguish if the resource is being disabled we
        # test the ON_OFF_SWITCH property of the resource.
        #
        # If the resource is being disabled the ON_OFF_SWITCH will be DISABLED before the STOP
        # method is called. So, conversely if the ON_OFF_SWITCH is ENABLED the resource is not
        # being disabled and instead the resource group is undergoing either a switch to
        # another node or is being evacuated from the node.
        #
        # - If the resource is being disabled we perform a normal shutdown, regardless of the
        # Migration_type setting.

        ON_OFF_SWITCH=$(${SCHA_RESOURCE_GET} -O ON_OFF_SWITCH -R ${RESOURCE} -G ${RESOURCEGROUP})

        debug_message "stop_domain - ON_OFF_SWITCH=${ON_OFF_SWITCH}"
        debug_message "stop_domain - MIGRATION_TYPE=${MIGRATION_TYPE}"

        if [[ "${ON_OFF_SWITCH}" = "DISABLED" ]]
        then
           # Same handling as Migration_type NORMAL: a failed shutdown
           # must be reported to the caller and not be masked as success.
           domain_shutdown
           rc=${?}
        else
           case "${MIGRATION_TYPE}" in
              NORMAL)
                 domain_shutdown
                 rc=${?}
              ;;
              MIGRATE*)
                 # A successful migration is a successful stop. Otherwise
                 # fall back to a normal shutdown and report its result.
                 if domain_migrate
                 then
                    rc=0
                 else
                    domain_shutdown
                    rc=${?}
                 fi
              ;;
              *)
                 # SCMSGS
                 # @explanation
                 # Invalid Migration_type specified.
                 # @user_action
                 # Delete and reregister the resource with
                 # a valid Migration_type entry.
                 scds_syslog -p daemon.error -t $(syslog_tag) -m \
                      "Invalid Migration_type=%s." \
                      "${MIGRATION_TYPE}"
                 rc=1
              ;;
           esac
        fi

        debug_message "Function: stop_domain - End"
        return ${rc}
}
1249      
get_target_host()
{
        # Determine the node the domain should be migrated to.
        #
        # Globals read:    SCHA_RESOURCEGROUP_GET, SCHA_CLUSTER_GET,
        #                  RESOURCEGROUP
        # Globals written: TARGET_HOST (by check_commandlog),
        #                  PRIVATELINK_TARGET_HOST (on success only)
        # Returns: 0 if a valid target host was found, 1 if a normal
        #          failover has to be performed instead.
        debug_message "Function: get_target_host - Begin"
        ${SET_DEBUG}

        typeset rc=1
        typeset local_node
        typeset node

        # Here, we need to determine the target host as the resource group is either being
        # switched or the node, where the resource group is online, is being evacuated.
        #
        # To determine the target host for a resource group switch we rely on the cluster
        # command log file /var/cluster/logs/commandlog to supply the target host. We need to
        # obtain the correct entry from the command log file and match against the following
        #
        #       <date> + ${RESOURCEGROUP} + "START" + "switch"
        #
        # after which we only save the nodename from a clrg or scswitch command.
        #
        # Sample /var/cluster/logs/commandlog output is as follows,
        #
        # 02/07/2008 08:45:13 pelko1 10548 root START - scswitch -z -g "xvm2-rg" -h "pelko2"
        # 02/07/2008 08:45:38 pelko1 10548 root END 0
        # 02/07/2008 09:01:35 pelko1 10874 root START - clrg "switch" -n "pelko1" "xvm2-rg"
        # 02/07/2008 09:01:36 pelko1 10874 root END -20827641
        #
        # If we are unable to match an entry, as perhaps the entry was logged at <date>
        # and we are checking at <date> + 1 second, i.e. we are checking just as the second
        # entry is incrementing to the next second, we perform another check. In fact the
        # last 10 seconds are checked from the commandlog.
        #
        # Once we have matched an entry from /var/cluster/logs/commandlog, we verify that
        # the target host is a valid nodelist entry for the resource group.
        #
        # - If we have a valid nodelist entry we then determine that target host's cluster
        # interconnect hostname to perform the migration or live migration.
        #
        # - If we are unable to find a match for a switch, we need to consider that an evacuate
        # node is being performed. However, if the node is being evacuated we will rely on
        # RGM to determine the nodename regardless if a migration or live migration was
        # requested. Subsequently, we perform a normal failover. This ensures that we do not
        # migrate or live migrate the domain to a node that may be different to the node
        # selected by RGM.
        #
        # So, suffice to say that if a "switch" match is not found, following the discovery
        # that the resource is not just being disabled, and that a migrate or live migrate
        # was defined, we will always perform a normal failover.
        #
        # Note that the target host match is performed within check_commandlog().

        check_commandlog

        debug_message "get_target_host - ${TARGET_HOST} size=${#TARGET_HOST}"

        local_node=$(/usr/bin/uname -n)

        if [ "${#TARGET_HOST}" -eq 0 ]
        then
           # SCMSGS
           # @explanation
           # A target host was not found
           # @user_action
           # None required. The domain will not be migrated or live
           # migrated instead a normal failover will be performed.
           scds_syslog -p daemon.notice -t $(syslog_tag) -m \
                "Target host not found, normal failover will be performed."

        elif [[ "${TARGET_HOST}" = "${local_node}" || "${TARGET_HOST}" = *[0-9]:global* ]]
        then
           # The target is this very node, or has the <digit>:global form:
           # both are treated as an evacuation of this node.
           #
           # SCMSGS
           # @explanation
           # The node is being evacuated.
           # @user_action
           # None required. The domain will not be migrated or live
           # migrated. Instead, a normal failover will be performed.
           scds_syslog -p daemon.notice -t $(syslog_tag) -m \
                "Node is being evacuated, normal failover will be performed."

        else
           # The target host is known to differ from the local node at
           # this point, so it is valid only if it is itself listed in
           # the resource group's nodelist.
           for node in $(${SCHA_RESOURCEGROUP_GET} -O NODELIST -G ${RESOURCEGROUP})
           do
              if [[ "${node}" = "${TARGET_HOST}" ]]
              then
                 rc=0
                 break
              fi
           done

           if [ "${rc}" -eq 0 ]
           then
              PRIVATELINK_TARGET_HOST=$(${SCHA_CLUSTER_GET} -O PRIVATELINK_HOSTNAME_NODE ${TARGET_HOST})
              debug_message "get_target_host - PRIVATELINK_TARGET_HOST=${PRIVATELINK_TARGET_HOST}"
           else
              # SCMSGS
              # @explanation
              # The target host found in the command log file is not
              # a valid entry within the resource groups nodelist.
              # @user_action
              # None required. The domain will not be migrated or live
              # migrated instead a normal failover will be performed.
              scds_syslog -p daemon.notice -t $(syslog_tag) -m \
                   "Target host %s not matched with the resource group nodelist, normal failover will be performed." \
                   "${TARGET_HOST}"
           fi
        fi

        debug_message "Function: get_target_host - End"
        return ${rc}
}
1352      
check_commandlog()
{
        # Search /var/cluster/logs/commandlog for a switch or evacuate
        # command logged within the last 10 seconds and extract the node
        # name it was given.
        #
        # Globals read:    AWK, TR, RESOURCEGROUP
        # Globals written: TARGET_HOST (left empty if nothing matched)
        debug_message "Function: check_commandlog - Begin"

        # Get the current epoch time
        typeset ETIME=$(/usr/bin/perl -e 'print time;')
        typeset STAMP
        typeset tries=10

        while (( tries > 0 ))
        do
           # Iteratively search the commandlog for a switch or evacuate, going back in time
           # by one second each time. If a match is found we break out of the loop.
           #
           # Convert the epoch time into the "MM/DD/YYYY HH:MM:SS" form used
           # by the commandlog. Date and time are both derived from ETIME so
           # that stepping back across midnight also steps the date back.
           #
           # For example, ETIME=1202814041 and ETIME=1202814040 give two
           # stamps that are one second apart, the latter being the earlier.
           STAMP=$(/usr/bin/perl -MPOSIX -e \
              'print strftime("%m/%d/%Y %H:%M:%S", localtime($ARGV[0]));' "${ETIME}")

           debug_message "check_commandlog - performed for ${STAMP}"

           # Check for a clrg switch or scswitch
           TARGET_HOST=$(/usr/bin/grep "${STAMP}" /var/cluster/logs/commandlog |\
              /usr/bin/grep -w START | /usr/bin/grep switch | /usr/bin/grep "\"${RESOURCEGROUP}\"" |\
           /usr/bin/sed -e 's/^.*-h //' -e 's/^.*-n //' | ${AWK} '{print $1}' | ${TR} -d '" ')

           [ "${#TARGET_HOST}" -ne 0 ] && break

           # Check for a clrg evacuate
           TARGET_HOST=$(/usr/bin/grep "${STAMP}" /var/cluster/logs/commandlog |\
              /usr/bin/grep -w START | /usr/bin/grep evacuate |\
           /usr/bin/sed -e 's/^.*-n //' | ${AWK} '{print $1}' | ${TR} -d '+" ' )

           [ "${#TARGET_HOST}" -ne 0 ] && break

           # Check for a scswitch -S
           TARGET_HOST=$(/usr/bin/grep "${STAMP}" /var/cluster/logs/commandlog |\
              /usr/bin/grep -w START | /usr/bin/grep scswitch | /usr/bin/grep "\-S" |\
           /usr/bin/sed -e 's/^.*-h //' | ${AWK} '{print $1}' | ${TR} -d '\-SK" ' )

           [ "${#TARGET_HOST}" -ne 0 ] && break

           # Nothing logged for this second, look one second further back.
           tries=$((tries - 1))
           ETIME=$((ETIME - 1))
        done

        debug_message "check_commandlog - TARGET_HOST=${TARGET_HOST}"

        debug_message "Function: check_commandlog - End"
}
1417 
1418 #
1419 # routines to perform domain migration
1420 #
migrate_xvm()
{
        # Migrate, or live migrate, the xVM domain ${DOMAIN} to
        # ${PRIVATELINK_TARGET_HOST} using ${XM}, bounded by
        # ${MAX_MIGRATE_TIMEOUT} seconds through ${HATIMERUN}.
        #
        # Globals read:    MIGRATION_TYPE, HATIMERUN, XM, DOMAIN,
        #                  MAX_MIGRATE_TIMEOUT, PRIVATELINK_TARGET_HOST
        # Globals written: OPTION
        # Returns: the exit status of ${HATIMERUN}.
        debug_message "Function: migrate_xvm - Begin"
        ${SET_DEBUG}

        typeset rc=0

        # Translate the resource's Migration_type into the xm subcommand.
        case "${MIGRATION_TYPE}" in
           MIGRATE)      OPTION="migrate" ;;
           MIGRATE_LIVE) OPTION="migrate --live" ;;
        esac

        debug_message "domain_migrate - Running /usr/sbin/xm ${OPTION} ${DOMAIN} ${PRIVATELINK_TARGET_HOST}"

        # Run the translated subcommand, not the raw MIGRATION_TYPE value:
        # "MIGRATE" and "MIGRATE_LIVE" are property values, not xm
        # subcommands. OPTION is left unquoted on purpose so that
        # "migrate --live" is passed as two words.
        ${HATIMERUN} -t ${MAX_MIGRATE_TIMEOUT} -k KILL \
            ${XM} ${OPTION} "${DOMAIN}" ${PRIVATELINK_TARGET_HOST} > /dev/null 2>&1
        rc=${?}

        debug_message "Function: migrate_xvm - End"
        return ${rc}
}
1440 
migrate_ldom()
{
        # Migrate the logical domain ${DOMAIN} to ${PRIVATELINK_TARGET_HOST}
        # by running /opt/SUNWscxvm/bin/ldm_migrate under ${HATIMERUN},
        # bounded by ${MAX_MIGRATE_TIMEOUT} seconds. Output of the command
        # is appended to ${LOGFILE}.
        #
        # Returns the exit status of ${HATIMERUN}.
        debug_message "Function: migrate_ldom - Begin"
        ${SET_DEBUG}

        typeset rc=0

        if [[ "${MIGRATION_TYPE}" = "MIGRATE" ]]
        then
           OPTION="migrate"
        fi

        debug_message "domain_migrate - Running /opt/SUNWscxvm/bin/ldm_migrate ${OPTION} ${DOMAIN} ${PRIVATELINK_TARGET_HOST}"

        # rc keeps its initial 0 unless the timed command fails.
        ${HATIMERUN} -t ${MAX_MIGRATE_TIMEOUT} -k KILL \
            /opt/SUNWscxvm/bin/ldm_migrate ${OPTION} "${DOMAIN}" ${PRIVATELINK_TARGET_HOST} ${PASSWORD_FILE} >> $LOGFILE 2>&1 || rc=${?}

        debug_message "Function: migrate_ldom - End"
        return ${rc}
}
1459 
1460 #
1461 # routines to cancel migration
1462 #
cancel_xvm_migration()
{
        # Cancelling a migration is a NO OP for a xvm domain, so simply
        # report success to the caller.
        return 0
}
1468 
cancel_ldom_migration()
{
        # Cancel a migration of the logical domain ${DOMAIN} and wait
        # until the domain has left its transitional state, or until
        # ${MAX_STOP_TIMEOUT} seconds have elapsed on the SECONDS clock.
        #
        # Globals read: LDM, DOMAIN, LOGFILE, VM, GREP, SLEEP, SECONDS,
        #               MAX_STOP_TIMEOUT
        # Returns: the status of the final debug_message call.
        debug_message "Function: cancel_ldom_migration - Begin"
        ${SET_DEBUG}

        # cancel domain migration for ldoms
        if ${LDM} cancel-operation migration ${DOMAIN} >> $LOGFILE 2>&1
        then
           # SCMSGS
           # @explanation
           # The domain migration operation was cancelled.
           # @user_action
           # None required. Informational message.
           scds_syslog -p daemon.notice -t $(syslog_tag) -m \
                "Migration of domain %s is cancelled, the domain state is now in active state." \
                "${DOMAIN}"
        fi

        # Poll every 5 seconds while the domain is still in transition.
        # The loop ends as soon as the domain has settled; SECONDS is
        # only read here and never overwritten, because it is the shared
        # clock that the stop timeout is measured against.
        while (( ${SECONDS} < ${MAX_STOP_TIMEOUT} )) && \
            get_${VM}_status | ${GREP} -q -E "^suspending|^resuming|^suspended|^starting" > /dev/null 2>&1
        do
           ${SLEEP} 5
        done

        debug_message "Function: cancel_ldom_migration - End"
}
1499 
domain_migrate()
{
        # Migrate or live migrate the domain to the node the resource
        # group is being switched to.
        #
        # Globals read:    MIGRATION_TYPE, VM, DOMAIN, RESOURCE, CCRADM,
        #                  CCR_TABLE, LOGFILE, TARGET_HOST,
        #                  PRIVATELINK_TARGET_HOST
        # Globals written: MSG
        # Returns: 0 if the domain was migrated and the NO-OP flag was
        #          stored in the CCR, 1 otherwise (the caller is then
        #          expected to perform a normal shutdown).
        debug_message "Function: domain_migrate - Begin"
        ${SET_DEBUG}

        typeset rc

        [[ "${MIGRATION_TYPE}" = "MIGRATE" ]] && MSG="migrated"
        [[ "${MIGRATION_TYPE}" = "MIGRATE_LIVE" ]] && MSG="live migrated"

        if get_target_host
        then
           # SCMSGS
           # @explanation
           # The domain is being migrated or live migrated to the target host.
           # @user_action
           # None required.
           scds_syslog -p daemon.notice -t $(syslog_tag) -m \
                "Domain %s is being %s to %s." \
                "${DOMAIN}" "${MSG}" "${TARGET_HOST}"

           migrate_${VM} ${MIGRATION_TYPE} ${DOMAIN} ${PRIVATELINK_TARGET_HOST}
           rc=${?}

           if (( ${rc} == 0 ))
           then
              # SCMSGS
              # @explanation
              # The domain was migrated or live migrated to the target host.
              # @user_action
              # None required. The domain successfully migrated or live migrated
              # from the source node to the target node.
              scds_syslog -p daemon.notice -t $(syslog_tag) -m \
                   "Domain %s successfully %s to %s." \
                   "${DOMAIN}" "${MSG}" "${TARGET_HOST}"

              # As the domain has been successfully migrated or live migrated
              # we need to indicate a successful stop by performing a NO-OP stop
              # and subsequently a successful start by performing a NO-OP start.

              if ${CCRADM} addkey --key=noop_${RESOURCE} --value="1" ${CCR_TABLE} >> $LOGFILE 2>&1
              then
                 debug_message "domain_migrate - .noop_${RESOURCE} flag added to CCR"

                 # Only announce the NO-OP stop once the flag is really
                 # in place; without it this function reports failure.
                 #
                 # SCMSGS
                 # @explanation
                 # The domain was migrated or live migrated.
                 # @user_action
                 # None required. Informational message.
                 scds_syslog -p daemon.notice -t $(syslog_tag) -m \
                      "NO-OP STOP being performed."
              else
                 # SCMSGS
                 # @explanation
                 # Failed to add the NO-OP flag to the CCR.
                 # @user_action
                 # Check the syslog for further messages.
                 # Determine why the ccr update failed.
                 scds_syslog -p daemon.error -t $(syslog_tag) -m \
                      "Failed to add NO-OP flag for %s to ccr." \
                      "${DOMAIN}"
                 rc=1
              fi

           elif (( ${rc} == 99 ))
           then
              # The migration command reported status 99, which is
              # treated as a timeout.
              #
              # SCMSGS
              # @explanation
              # The domain migration or live migration timed out.
              # @user_action
              # None required. Informational message.
              scds_syslog -p daemon.notice -t $(syslog_tag) -m \
                   "Migration of domain %s timed out, the domain state is now shut off." \
                   "${DOMAIN}"

              rc=1
              cancel_${VM}_migration
           else
              # SCMSGS
              # @explanation
              # The domain failed to migrate or live migrate to the target host.
              # @user_action
              # None required. The domain failed to migrate or live migrate
              # from the source node to the target node. A normal failover
              # will be performed.
              scds_syslog -p daemon.notice -t $(syslog_tag) -m \
                   "Domain %s failed to %s to %s, normal failover will be performed." \
                   "${DOMAIN}" "${MSG}" "${TARGET_HOST}"

              rc=1
              cancel_${VM}_migration
           fi
        else
           # No usable target host, reason already logged.
           rc=1
        fi

        # If the domain has successfully migrated, we will now delete the domain.
        #
        # Doing this ensures that the domain is only defined and able to be started
        # on one cluster node at a time. Domains can use shared storage between cluster
        # nodes so it is very important that we prevent any data corruption if a domain
        # gets manually started on multiple cluster nodes where shared storage is used.
        #
        # Of course using SUNW.HAStoragePlus somewhat protects against this, however we
        # simply want to avoid any manual administrative errors performed by mistake.
        #
        # Note, unless the domain was migrated or live migrated, the domain is defined
        # before startup using a previously dumped XML file for the administrative file
        # system.

        (( ${rc} == 0 )) && [[ "${VM}" == "xvm" ]] && domain_delete

        debug_message "Function: domain_migrate - End"
        return ${rc}
}
1615 
1616 #
1617 # routines to perform domain shutdown
1618 #
#
# shutdown_xvm - request a shutdown of the xVM domain ${DOMAIN} using
# "${VIRSH} shutdown".
#
# Globals: VIRSH (command to run), DOMAIN (domain name), SET_DEBUG
#          (expanded as a command on entry).
# Returns: the exit status of the virsh shutdown command.
#
shutdown_xvm()
{
        debug_message "Function: shutdown_xvm - Begin"
        ${SET_DEBUG}

        typeset rc=0

        # Note that the virsh shutdown command returns before the domain
        # has shutdown, as such we do not use hatimerun.
        #
        # ${DOMAIN} is quoted so the name is always handed to virsh as
        # exactly one argument, whatever characters it contains.

        ${VIRSH} shutdown "${DOMAIN}" > /dev/null 2>&1 || rc=${?}

        debug_message "Function: shutdown_xvm - End"
        return ${rc}
}
1635      
#
# shutdown_ldom - stop the logical domain ${DOMAIN} with "ldm stop-domain",
# run under hatimerun with a limit of ${MAX_STOP_TIMEOUT}.
#
# Globals: VM, DOMAIN, GREP, HATIMERUN, LDM, MAX_STOP_TIMEOUT, LOGFILE,
#          SET_DEBUG.
# Returns: 0  the reported state is not one that needs stopping
#          2  get_${VM}_status failed; the domain is treated as not present
#          otherwise the exit status of the hatimerun'd stop-domain command
#
shutdown_ldom()
{
        debug_message "Function: shutdown_ldom - Begin"
        ${SET_DEBUG}

        typeset rc
        # Declared separately from the assignment below so that ${?} still
        # reflects get_${VM}_status and not typeset.
        typeset status

        status=$(get_${VM}_status)
        if (( ${?} == 0 ))
        then
           # ${status} is left unquoted as in the original, so the shell
           # collapses whitespace and newlines before grep sees the text.
           # NOTE(review): confirm get_${VM}_status only ever prints a
           # single word before quoting this expansion.
           if echo ${status} | ${GREP} -q -E "^active$|^suspending|^resuming|^suspended|^starting" > /dev/null 2>&1
           then
              ${HATIMERUN} -t "${MAX_STOP_TIMEOUT}" -k KILL ${LDM} stop-domain "${DOMAIN}" >> "${LOGFILE}" 2>&1
              rc=${?}
           else
              # domain is already stopped
              rc=0
           fi
        else
           # domain is not present.
           rc=2
        fi

        # This trace marks the end of the function (it used to say "Begin").
        debug_message "Function: shutdown_ldom - End"
        return ${rc}
}
1662 
#
# domain_shutdown - stop ${DOMAIN} gracefully, forcing it down if needed.
#
# Calls shutdown_${VM}, then polls is_${VM}_up every 5 seconds until the
# domain is down or ${SECONDS} reaches ${MAX_STOP_TIMEOUT}. A domain that is
# still up after that, or whose shutdown request failed, is handed to
# destroy_${VM}. When the resulting status is 0 the domain is removed from
# this node with domain_delete.
#
# Globals: VM, DOMAIN, MAX_STOP_TIMEOUT, SLEEP, SECONDS (read only),
#          SET_DEBUG.
# Returns: 0  shutdown_${VM} returned 2 (shutdown_ldom uses 2 for "domain
#             is not present"), the domain went down gracefully, or
#             destroy_${VM} succeeded
#          otherwise the exit status of destroy_${VM}
#          The status of domain_delete is not propagated.
#
domain_shutdown()
{
        debug_message "Function: domain_shutdown - Begin"
        ${SET_DEBUG}

        typeset rc

        # Coordinate with the domain OS to perform a graceful shutdown.
        # Note that the virsh shutdown command returns before the domain
        # has shutdown, as such we do not use hatimerun.

        shutdown_${VM}
        rc=${?}
        if (( ${rc} == 2 ))
        then
                # Nothing to stop and nothing to delete.
                debug_message "Function: domain_shutdown - End"
                return 0
        elif (( ${rc} == 0 ))
        then
           # Loop to test if the domain shuts down gracefully
           # or if the shutdown time is exceeded.
           #
           # SECONDS is the shell's own elapsed-time counter, so the budget
           # is measured against that counter and not from entry to this
           # function. The loop is left with "break" instead of assigning
           # to SECONDS, so the counter stays accurate for later code.

           while (( ${SECONDS} < ${MAX_STOP_TIMEOUT} ))
           do
              is_${VM}_up || break
              ${SLEEP} 5
           done

           # Check once more: the last sleep may have run past the limit
           # while the domain finished shutting down.
           if is_${VM}_up
           then
              # SCMSGS
              # @explanation
              # The domain failed to shutdown gracefully.
              # @user_action
              # None required. The domain failed to shutdown
              # gracefully and will now be immediately terminated.
              scds_syslog -p daemon.notice -t $(syslog_tag) -m \
                   "Domain %s failed to shutdown gracefully, immediate shutdown will now be performed." \
                   "${DOMAIN}"

              destroy_${VM}
              rc=${?}
           else
              # SCMSGS
              # @explanation
              # The domain was shutdown gracefully.
              # @user_action
              # None required. The domain has shutdown gracefully.
              scds_syslog -p daemon.info -t $(syslog_tag) -m \
                   "Domain %s has been gracefully shutdown." \
                   "${DOMAIN}"
              rc=0
           fi

        else
           # error already logged
           destroy_${VM}
           rc=${?}
        fi

        # If the domain has been shut down successfully, we will now delete it.
        #
        # Doing this ensures that the domain is only defined and able to be started
        # on one cluster node at a time. Domains can use shared storage between cluster
        # nodes so it is very important that we prevent any data corruption if a domain
        # gets manually started on multiple cluster nodes where shared storage is used.
        #
        # Of course using SUNW.HAStoragePlus somewhat protects against this, however we
        # simply want to avoid any manual administrative errors performed by mistake.
        #
        # Note, unless the domain was migrated or live migrated, the domain is defined
        # before startup using a previously dumped XML file for the administrative file
        # system.

        if (( ${rc} == 0 ))
        then
           # The result of the delete does not change our return status.
           domain_delete
        fi

        debug_message "Function: domain_shutdown - End"
        return ${rc}
}
1746 
1747 #
1748 # routines to destroy domain
1749 #
#
# destroy_xvm - immediately terminate the xVM domain ${DOMAIN} using
# "${VIRSH} destroy"; command output is appended to ${LOGFILE}.
#
# Globals: VIRSH, DOMAIN, LOGFILE, SET_DEBUG.
# Returns: 0 if the virsh destroy command succeeded, 1 otherwise (an error
#          is logged through scds_syslog).
#
destroy_xvm()
{
        debug_message "Function: destroy_xvm - Begin"
        ${SET_DEBUG}

        typeset rc

        # ${DOMAIN} and ${LOGFILE} are quoted so that neither the domain
        # name nor the log path is subject to word splitting or globbing.
        if ${VIRSH} destroy "${DOMAIN}" >> "${LOGFILE}" 2>&1
        then
           # SCMSGS
           # @explanation
           # The domain was immediately terminated.
           # @user_action
           # None required. The domain had previously failed to shutdown
           # gracefully but has now been immediately terminated.
           scds_syslog -p daemon.notice -t $(syslog_tag) -m \
                "Domain %s has been immediately terminated." \
                "${DOMAIN}"
           rc=0
        else
           # SCMSGS
           # @explanation
           # The /usr/bin/virsh destroy command failed.
           # @user_action
           # Determine why it was not possible to immediately terminate
           # the domain.
           scds_syslog -p daemon.error -t $(syslog_tag) -m \
                "Domain %s failed to shutdown immediately." \
                "${DOMAIN}"
           rc=1
        fi

        debug_message "Function: destroy_xvm - End"
        return ${rc}
}
1785      
#
# destroy_ldom - forcefully stop the logical domain ${DOMAIN} using
# "${LDM} stop-domain -f"; command output is appended to ${LOGFILE}.
#
# Globals: LDM, DOMAIN, LOGFILE, SET_DEBUG.
# Returns: 0 if the forced stop-domain command succeeded, 1 otherwise (an
#          error is logged through scds_syslog).
#
destroy_ldom()
{
        debug_message "Function: destroy_ldom - Begin"
        ${SET_DEBUG}

        typeset rc

        # ${DOMAIN} and ${LOGFILE} are quoted so that neither the domain
        # name nor the log path is subject to word splitting or globbing.
        if ${LDM} stop-domain -f "${DOMAIN}" >> "${LOGFILE}" 2>&1
        then
           # SCMSGS
           # @explanation
           # The domain was immediately terminated.
           # @user_action
           # None required. The domain had previously failed to shutdown
           # gracefully but has now been immediately terminated.
           scds_syslog -p daemon.notice -t $(syslog_tag) -m \
                "Domain %s has been forcefully terminated." \
                "${DOMAIN}"
           rc=0
        else
           # SCMSGS
           # @explanation
           # The /opt/SUNWldm/bin/ldm stop-domain "-f" command failed.
           # @user_action
           # Determine why it was not possible to forcefully stop
           # the domain.
           scds_syslog -p daemon.error -t $(syslog_tag) -m \
                "Domain %s failed to do a forceful shutdown." \
                "${DOMAIN}"
           rc=1
        fi

        debug_message "Function: destroy_ldom - End"
        return ${rc}
}
1821 
1822 #
1823 # routines to remove domains from the node
1824 #
#
# domain_delete - remove ${DOMAIN} from this node by calling delete_${VM}.
#
# Globals: VM, DOMAIN, SET_DEBUG.
# Returns: 0 when delete_${VM} succeeds (a notice is logged), 1 otherwise.
#
domain_delete()
{
        debug_message "Function: domain_delete - Begin"
        ${SET_DEBUG}

        # Deleting the domain after shutdown avoids the possibility of
        # someone manually starting it on a different node, which would
        # compromise the domain if shared storage was used for it. The
        # domain's configuration is always dumped to the agent's
        # administrative file system so that the domain can be defined
        # again before startup.

        # Assume failure; delete_${VM} logs its own error in that case.
        typeset outcome=1

        if delete_${VM}
        then
           # SCMSGS
           # @explanation
           # The domain was deleted.
           # @user_action
           # None required. The domain has been deleted as it
           # will be defined on another node. Deleting the domain
           # on this node ensures that it can't be started on
           # more than one cluster node at a time.
           scds_syslog -p daemon.notice -t $(syslog_tag) -m \
                "Domain %s has been deleted on this node." \
                "${DOMAIN}"
           outcome=0
        fi

        debug_message "Function: domain_delete - End"
        return ${outcome}
}
1860 
#
# delete_xvm - remove the xVM domain ${DOMAIN} from this node using
# "${XM} delete"; command output is appended to ${LOGFILE}.
#
# Globals: XM, DOMAIN, LOGFILE, SET_DEBUG.
# Returns: 0 if the delete command succeeded, 1 otherwise (an error is
#          logged through scds_syslog).
#
delete_xvm()
{
        debug_message "Function: delete_xvm - Begin"
        ${SET_DEBUG}

        typeset rc=0

        # Use the ${XM} command definition from the top of this file rather
        # than a second hard-coded copy of the path. ${DOMAIN} and
        # ${LOGFILE} are quoted to avoid word splitting and globbing.
        if ! ${XM} delete "${DOMAIN}" >> "${LOGFILE}" 2>&1
        then
           # SCMSGS
           # @explanation
           # The /usr/sbin/xm delete command failed.
           # @user_action
           # Determine why it was not possible to delete the domain.
           scds_syslog -p daemon.error -t $(syslog_tag) -m \
                "Failed to delete domain %s on this node." \
                "${DOMAIN}"
           rc=1
        fi

        debug_message "Function: delete_xvm - End"
        return ${rc}
}
1884 
#
# delete_ldom - remove the logical domain ${DOMAIN} from this node.
#
# If get_${VM}_status reports "bound" the domain is first unbound with
# "${LDM} unbind-domain"; it is then removed with "${LDM} remove-domain".
# Command output is appended to ${LOGFILE}.
#
# Globals: VM, GREP, LDM, DOMAIN, LOGFILE, SET_DEBUG.
# Returns: 0 if the domain was removed
#          1 if the unbind or the remove failed (an error is logged through
#            scds_syslog; remove is not attempted after a failed unbind)
#
delete_ldom()
{
        debug_message "Function: delete_ldom - Begin"
        ${SET_DEBUG}

        typeset rc=0

        if get_${VM}_status | ${GREP} -q -E "^bound$" > /dev/null 2>&1
        then

           # if the domain is in bound state, unbind it.
           if ! ${LDM} unbind-domain "${DOMAIN}" >> "${LOGFILE}" 2>&1
           then
              # SCMSGS
              # @explanation
              # The /opt/SUNWldm/bin/ldm unbind-domain command failed.
              # @user_action
              # Determine why it was not possible to unbind the domain.
              scds_syslog -p daemon.error -t $(syslog_tag) -m \
                   "Failed to unbind domain %s on this node." \
                   "${DOMAIN}"
              rc=1
           fi
        fi

        # Only attempt the removal when the unbind step did not fail.
        if (( ${rc} == 0 ))
        then
           if ! ${LDM} remove-domain "${DOMAIN}" >> "${LOGFILE}" 2>&1
           then
              # SCMSGS
              # @explanation
              # The /opt/SUNWldm/bin/ldm remove-domain command failed.
              # @user_action
              # Determine why it was not possible to remove the domain.
              scds_syslog -p daemon.error -t $(syslog_tag) -m \
                   "Failed to remove domain %s on this node." \
                   "${DOMAIN}"
              rc=1
           fi
        fi

        debug_message "Function: delete_ldom - End"
        return ${rc}
}