New functions.ksh
  1 #
  2 # CDDL HEADER START
  3 #
  4 # The contents of this file are subject to the terms of the
  5 # Common Development and Distribution License (the License).
  6 # You may not use this file except in compliance with the License.
  7 #
  8 # You can obtain a copy of the license at usr/src/CDDL.txt
  9 # or http://www.opensolaris.org/os/licensing.
 10 # See the License for the specific language governing permissions
 11 # and limitations under the License.
 12 #
 13 # When distributing Covered Code, include this CDDL HEADER in each
 14 # file and include the License file at usr/src/CDDL.txt.
 15 # If applicable, add the following below this CDDL HEADER, with the
 16 # fields enclosed by brackets [] replaced with your own identifying
 17 # information: Portions Copyright [yyyy] [name of copyright owner]
 18 #
 19 # CDDL HEADER END
 20 #
 21 # Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 22 # Use is subject to license terms.
 23 #
 24 #ident   "@(#)functions.ksh 1.1  08/11/19 SMI"
 25 #
 26 
 27 PKG=SUNWscxvm
 28 TASK_COMMAND=""
 29 RESOURCE_PROJECT_NAME=""
 30 SCLOGGER=/usr/cluster/lib/sc/scds_syslog
 31 LOGGER=/usr/bin/logger
 32 
 33 syslog_tag()
 34 {
 35         ${SET_DEBUG}
 36         print "SC[${PKG:-??}.${METHOD:-??}]:${RESOURCEGROUP:-??}:${RESOURCE:-??}"
 37 }
 38 
 39 scds_syslog()
 40 {
 41         if [ -f "${SCLOGGER}" ]
 42         then
 43            ${SCLOGGER} "$@" &
 44         else
 45            while getopts 'p:t:m' opt
 46            do
 47                 case "${opt}" in
 48                    t) TAG=${OPTARG};;
 49                    p) PRI=${OPTARG};;
 50                 esac
 51            done
 52         
 53            shift $((${OPTIND} - 1))
 54            LOG_STRING=$(/usr/bin/printf "$@")
 55            ${LOGGER} -p ${PRI} -t ${TAG} ${LOG_STRING}
 56         fi
 57 }
 58 
 59 debug_message()
 60 {
 61         if [ "${DEBUG}" = "${RESOURCE}" -o "${DEBUG}" = "ALL" ]
 62         then
 63            SET_DEBUG="set -x"
 64            DEBUG_TEXT=${1}
 65 
 66            scds_syslog -p daemon.debug -t $(syslog_tag) -m \
 67               "%s" "${DEBUG_TEXT}"
 68         else
 69            SET_DEBUG=
 70         fi
 71 }
 72 
 73 validate()
 74 {
 75         debug_message "Function: validate - Begin"
 76         ${SET_DEBUG}
 77 
 78         if [ "$(/usr/bin/uname -i)" != "i86xpv" ] 
 79         then
 80            # SCMSGS
 81            # @explanation
 82            # Solaris is not booted with xVM.
 83            # @user_action
 84            # Ensure that the default boot grub menu is set to boot
 85            # Solaris xVM.
 86            scds_syslog -p daemon.error -t $(syslog_tag) -m \
 87                 "Node is not booted with xVM"
 88 
 89            return 1
 90         fi
 91 
 92         if [[ "${FAILOVER_TYPE}" != "normal" && "${FAILOVER_TYPE}" != "migrate" && "${FAILOVER_TYPE}" != "migrate --live" ]]
 93         then
 94            # SCMSGS
 95            # @explanation
 96            # Incorrect Failover_type specified.
 97            # @user_action
 98            # Ensure that Failover_type="normal"|"migrate"|"migrate --live" 
 99            # is specified within /opt/SUNWscxvm/util/xvm_config.
100            scds_syslog -p daemon.error -t $(syslog_tag) -m \
101                 "Failover_type %s is not valid" \
102                 "${FAILOVER_TYPE}"
103 
104            return 1
105         fi
106 
107         if [ ! -d "${ADMIN}" ]
108         then
109            # SCMSGS
110            # @explanation
111            # The agent administrative pathname does not exist.
112            # @user_action
113            # Ensure that the agent administrative pathname is specified
114            # within /opt/SUNWscxvm/util/xvm_config.
115            scds_syslog -p daemon.error -t $(syslog_tag) -m \
116                 "Administrative pathname not found"
117 
118            return 1
119         fi
120 
121         debug_message "Function: validate - End"
122         return 0
123 }
124 
#
# start_domain
#
# Start the domain ${DOMAIN} for resource ${RESOURCE}.
#
#   - If the domain is not known to virsh it is defined from
#     ${ADMIN}/${RESOURCE}.xml; without that file, or if the define
#     fails, the function returns 1.
#   - If ${ADMIN}/.noop_${RESOURCE} exists, the file is removed and no
#     start is performed (NO-OP start).
#   - If the domain is already running, blocked, paused or in shutdown
#     it is left alone.
#   - Otherwise the domain is started with virsh start.
#
# After a successful start (or NO-OP) the domain configuration is dumped
# to ${ADMIN}/${RESOURCE}.xml. A failed dump is logged but does not
# change the return value.
#
# Returns 0 on success, 1 if the domain could not be defined or started.
#
start_domain()
{
        debug_message "Function: start_domain - Begin"
        ${SET_DEBUG}

        typeset rc=0
        typeset xml_file="${ADMIN}/${RESOURCE}.xml"
        typeset noop_file="${ADMIN}/.noop_${RESOURCE}"

        # Turn off PMF restart. Starting a domain does not leave
        # a running pid as in a classic Solaris Cluster agent.

        START_TIMEOUT=$(/usr/cluster/bin/scha_resource_get -O START_TIMEOUT \
           -R "${RESOURCE}" -G "${RESOURCEGROUP}" )

        sleep ${START_TIMEOUT} &
        /usr/cluster/bin/pmfadm -s "${RESOURCEGROUP},${RESOURCE},0.svc"

        # Check if the domain exists.
        #
        # If the domain does not exist, we may be starting the domain
        # on a new cluster node following a failover. As such we will
        # define the domain using the previously dumped XML file
        # located within the agent's administrative file system.
        #
        # If the domain does exist either the domain was manually
        # started or the domain was migrated or live migrated from
        # another cluster node. As such we will use the already
        # defined domain.
        #
        # Note that when the domain is successfully stopped the domain
        # is deleted. We do this simply to avoid the domain from
        # being manually started on multiple cluster nodes. See
        # domain_delete() for more information.

        if /usr/bin/virsh dominfo "${DOMAIN}" > /dev/null 2>&1
        then
           debug_message "Validate - domain ${DOMAIN} exists"
        else
           if [ -f "${xml_file}" ]
           then
                if /usr/bin/virsh define "${xml_file}" > /dev/null 2>&1
                then
                   # SCMSGS
                   # @explanation
                   # The domain is being defined using a XML file.
                   # @user_action
                   # None, the domain is being defined using a previously defined
                   # XML file when the domain was last successfully started.
                   scds_syslog -p daemon.notice -t $(syslog_tag) -m \
                        "Domain %s defined using %s/%s.xml" \
                        "${DOMAIN}" "${ADMIN}" "${RESOURCE}"
                else
                   # SCMSGS
                   # @explanation
                   # The domain failed to be defined using a XML file.
                   # @user_action
                   # The command /usr/bin/virsh define failed to define the domain.
                   # Determine if you have specified the correct domain name when
                   # registering the resource.
                   scds_syslog -p daemon.error -t $(syslog_tag) -m \
                        "Failed to define %s using %s/%s.xml" \
                        "${DOMAIN}" "${ADMIN}" "${RESOURCE}"

                   return 1
                fi
           else
                # SCMSGS
                # @explanation
                # The domain does not exist.
                # @user_action
                # You must ensure that the domain exists. The preferred
                # method for creating a domain is to use virt-install.
                # Refer to the virt-install(1M) man page for further
                # information.
                scds_syslog -p daemon.error -t $(syslog_tag) -m \
                   "Domain %s does not exist" \
                   "${DOMAIN}"

                return 1
           fi
        fi

        # Tolerate a manually started domain and a NO-OP start
        # otherwise start the domain.

        if [ -f "${noop_file}" ]
        then
           # SCMSGS
           # @explanation
           # The domain was migrated or live migrated.
           # @user_action
           # None required. Informational message.
           scds_syslog -p daemon.notice -t $(syslog_tag) -m \
              "NO-OP START being performed"

           /usr/bin/rm "${noop_file}"
           debug_message "start_domain - ${ADMIN}/.noop_${RESOURCE} deleted"

        elif echo $(/usr/bin/virsh domstate "${DOMAIN}") | /usr/xpg4/bin/grep -q -E "running|blocked|paused|in shutdown" > /dev/null 2>&1
        then
           # SCMSGS
           # @explanation
           # The domain was manually started.
           # @user_action
           # None required. Informational message.
           scds_syslog -p daemon.notice -t $(syslog_tag) -m \
              "Domain %s was manually started" \
              "${DOMAIN}"
        else
           if /usr/bin/virsh start "${DOMAIN}" > /dev/null 2>&1
           then
                # SCMSGS
                # @explanation
                # The domain was started successfully.
                # @user_action
                # None required. Informational message.
                scds_syslog -p daemon.notice -t $(syslog_tag) -m \
                   "Domain %s started" \
                   "${DOMAIN}"
           else
                # SCMSGS
                # @explanation
                # The domain failed to start.
                # @user_action
                # Check the syslog for further messages. If possible
                # the cluster will attempt to restart the domain.
                scds_syslog -p daemon.error -t $(syslog_tag) -m \
                   "Domain %s failed to start" \
                   "${DOMAIN}"

                rc=1
           fi
        fi

        if [ ${rc} -eq 0 ]
        then
           # Dump the domain configuration into an XML file. This file is then
           # used on another cluster node to define the domain but only if the
           # domain does not exist.
           #
           # The dump goes to a temporary file in the same directory and is
           # renamed into place only when virsh succeeded. Redirecting
           # straight into the XML file would truncate the previous good
           # copy before virsh runs, leaving nothing to define the domain
           # from if the dump fails.

           if /usr/bin/virsh dumpxml "${DOMAIN}" > "${xml_file}.$$" && \
              /usr/bin/mv -f "${xml_file}.$$" "${xml_file}"
           then
                debug_message "start_domain - ${ADMIN}/${RESOURCE}.xml created"
           else
                /usr/bin/rm -f "${xml_file}.$$"

                # SCMSGS
                # @explanation
                # /usr/bin/virsh dumpxml for domain failed.
                # @user_action
                # Determine why the /usr/bin/virsh dumpxml command failed.
                scds_syslog -p daemon.error -t $(syslog_tag) -m \
                   "/usr/bin/virsh dumpxml %s > %s/%s.xml failed" \
                   "${DOMAIN}" "${ADMIN}" "${RESOURCE}"
           fi
        fi

        debug_message "Function: start_domain - End"
        return ${rc}
}
282 
#
# check_domain
#
# Probe the domain ${DOMAIN}.
#
# Returns
#     0  the domain is running, blocked, paused or in shutdown and either
#        no PLUGIN_PROBE is set, the PLUGIN_PROBE program is not
#        executable, or the PLUGIN_PROBE returned 0.
#   100  the start program is still running, the domain is in any other
#        state, or the PLUGIN_PROBE timed out, returned 100, returned an
#        unexpected value, or returned 201 while gds_svc_start or
#        gds_svc_stop is running for this resource.
#   201  the PLUGIN_PROBE returned 201 and neither gds_svc_start nor
#        gds_svc_stop is running for this resource.
#
check_domain()
{
        debug_message "Function: check_domain - Begin"
        ${SET_DEBUG}

        typeset rc

        if /usr/bin/pgrep -f "control_xvm start -R ${RESOURCE} " >/dev/null 2>&1
        then
           debug_message "Function: check_domain - start program is still running "
           rc=100
        else
           domstate=$(/usr/bin/virsh domstate "${DOMAIN}")

           case "${domstate}" in

                # Acceptable run states

                "running"|"blocked"|"paused"|"in shutdown")

                   if [ "${#PLUGIN_PROBE}" -ne 0 ]
                   then
                        # PLUGIN_PROBE may carry arguments; only its first
                        # word is the program that has to be executable.
                        if [ -x "$(echo ${PLUGIN_PROBE} | /usr/bin/awk '{print $1}')" ]
                        then
                           PROBE_TIMEOUT=$(/usr/cluster/bin/scha_resource_get -O Extension \
                                -R "${RESOURCE}" -G "${RESOURCEGROUP}" Probe_timeout)

                           # The value is the second word of the output.
                           PROBE_TIMEOUT=$(/usr/bin/echo ${PROBE_TIMEOUT} | /usr/bin/awk '{print $2}')

                           # Run the supplied probe with only 90% of PROBE_TIMEOUT. Also note that this
                           # is supplied as a parameter to the PLUGIN_PROBE.

                           HATIMERUN_TIMEOUT=$(/usr/bin/expr ${PROBE_TIMEOUT} \* 90 \/ 100)

                           # PLUGIN_PROBE is deliberately unquoted so that its
                           # arguments are passed as separate words.
                           output=$(/usr/cluster/bin/hatimerun -t ${HATIMERUN_TIMEOUT} -k 9 ${PLUGIN_PROBE} ${HATIMERUN_TIMEOUT})
                           rc=$?

                           case ${rc} in
                                0)      debug_message "check_domain - ${DOMAIN} ${output}"
                                        rc=0
                                        ;;
                                99)
                                        # SCMSGS
                                        # @explanation
                                        # The domain probe timed out.
                                        # @user_action
                                        # Ensure that ${PLUGIN_PROBE} can complete within
                                        # 90% of PROBE_TIMEOUT.
                                        scds_syslog -p daemon.error -t $(syslog_tag) -m \
                                           "%s did not complete within %s seconds" \
                                           "${PLUGIN_PROBE}" "${HATIMERUN_TIMEOUT}"

                                        rc=100
                                        ;;
                                100)    if /usr/bin/pgrep -f "gds_svc_start .*-R ${RESOURCE} " >/dev/null 2>&1
                                        then
                                           debug_message "check_domain - ${DOMAIN} is still starting"
                                           rc=100
                                        elif /usr/bin/pgrep -f "gds_svc_stop .*-R ${RESOURCE} " >/dev/null 2>&1
                                        then
                                           debug_message "check_domain - ${DOMAIN} is stopping"
                                           rc=100
                                        else
                                           # SCMSGS
                                           # @explanation
                                           # The domain probe has requested a domain restart.
                                           # @user_action
                                           # None. A domain restart will be attempted.
                                           scds_syslog -p daemon.error -t $(syslog_tag) -m \
                                                "%s has requested a domain restart %s" \
                                                "${PLUGIN_PROBE}" "${output}"

                                           rc=100
                                        fi
                                        ;;
                                201)    if /usr/bin/pgrep -f "gds_svc_start .*-R ${RESOURCE} " >/dev/null 2>&1
                                        then
                                           debug_message "check_domain - ${DOMAIN} is still starting"
                                           rc=100
                                        elif /usr/bin/pgrep -f "gds_svc_stop .*-R ${RESOURCE} " >/dev/null 2>&1
                                        then
                                           debug_message "check_domain - ${DOMAIN} is stopping"
                                           rc=100
                                        else
                                           # SCMSGS
                                           # @explanation
                                           # The domain has requested an immediate failover.
                                           # @user_action
                                           # None. The domain will be immediately failed over.
                                           scds_syslog -p daemon.error -t $(syslog_tag) -m \
                                                "%s has requested an immediate failover" \
                                                "${PLUGIN_PROBE}"

                                           rc=201
                                        fi
                                        ;;
                                *)
                                        # SCMSGS
                                        # @explanation
                                        # ${PLUGIN_PROBE} did not return 0, 100 or 201.
                                        # @user_action
                                        # None. A domain restart will be attempted.
                                        scds_syslog -p daemon.error -t $(syslog_tag) -m \
                                           "%s did not return 0, 100 or 201, a domain restart will be attempted" \
                                           "${PLUGIN_PROBE}"
                                        rc=100
                                        ;;
                           esac
                        else
                           # SCMSGS
                           # @explanation
                           # ${PLUGIN_PROBE} does not exist or is not executable.
                           # @user_action
                           # Check the pathname exists and that ${PLUGIN_PROBE} is executable.
                           scds_syslog -p daemon.error -t $(syslog_tag) -m \
                                "%s non-existent executable" \
                                "${PLUGIN_PROBE}"

                           # The domain itself is in an acceptable state, so a
                           # missing probe program is logged but not treated
                           # as a failure.
                           rc=0
                        fi
                   else
                        rc=0
                   fi

                        ;;

                # Restartable run states

                "shut off"|"crashed")

                        rc=100
                        ;;

                # Unknown run states

                *)
                        rc=100
                        ;;
           esac

           debug_message "check_domain - ${DOMAIN} ${domstate}"

        fi

        debug_message "Function: check_domain - End"
        return ${rc}
}
430 
#
# stop_domain
#
# Stop the domain for resource ${RESOURCE}.
#
# Sets MAX_MIGRATE_TIMEOUT (25% of STOP_TIMEOUT) and MAX_STOP_TIMEOUT
# (50% of STOP_TIMEOUT) and resets SECONDS to 0 before calling
# domain_shutdown() or domain_migrate().
#
# Returns the status of domain_shutdown(), 0 after a successful
# domain_migrate(), or 1 when ${FAILOVER_TYPE} is not valid.
#
stop_domain()
{
        debug_message "Function: stop_domain - Begin"
        ${SET_DEBUG}

        typeset rc=0

        STOP_TIMEOUT=$(/usr/cluster/bin/scha_resource_get -O STOP_TIMEOUT \
           -R "${RESOURCE}" -G "${RESOURCEGROUP}" )

        # Note that GDS will attempt to cleanup after 80% of STOP_TIMEOUT
        # has been consumed.  In this regard, we only allocate a combined
        # 75% of STOP_TIMEOUT to MAX_MIGRATE_TIMEOUT and MAX_STOP_TIMEOUT.
        #
        # This leaves 5% for domain_destroy() which may be called if
        # domain_shutdown() exceeds its timeout and finally domain_delete().

        MAX_MIGRATE_TIMEOUT=$(/usr/bin/expr ${STOP_TIMEOUT} \* 25 \/ 100)
        MAX_STOP_TIMEOUT=$(/usr/bin/expr ${STOP_TIMEOUT} \* 50 \/ 100)
        SECONDS=0

        # At resource creation, the administrator can determine the Failover_type.
        # Valid values for Failover_type are
        #
        # Failover_type="normal"
        #   o Stop the resource (shutdown the domain)
        #   o Failover the resource group from the source node to the target node
        #   o Start the resource (start the domain)
        #
        # Failover_type="migrate"
        #   o Suspend the domain on the source node
        #   o Copy the domain's memory pages from the source node to the target node
        #   o Resume the domain on the target node
        #
        # Failover_type="migrate --live"
        #   o Iteratively copy the domain's memory pages from the source node to the target node
        #   o When pre-copy is no longer beneficial, suspend the domain on the source node
        #   o Copy the domain's remaining "dirty" pages from the source node to the target node
        #   o Resume the domain on the target node
        #
        # Note that migration or live migration is performed over the cluster interconnect.
        #
        # For migration or live migration to be attempted across Solaris Cluster xVM nodes
        # the following conditions must be met.
        #
        # - The target Solaris Cluster xVM node must be running the same xVM version.
        #
        # - The migration TCP port must be open and accepting connections from the source
        #    Solaris Cluster xVM node.
        #
        # - There must be sufficient resources for the domain to run in.
        #
        # - If the conditions are met and migration or live migration is successful a NO-OP
        # STOP and START is performed. This will ensure a successful STOP and START to the
        # appropriate RGM callback methods. Furthermore, doing a NO-OP RGM failover will
        # ensure that RGM subsequently actions any dependencies and that Solaris Cluster
        # reflects the correct state and status of resource groups and resources.
        #
        # - If the conditions are met but migration or live migration is not successful a
        # normal failover will be performed.
        #
        # - If the conditions are not met, migration or live migration will fail and a normal
        # failover will be performed.
        #
        # However, before attempting a migration or live migration we need to determine if the
        # resource is being disabled. To distinguish if the resource is being disabled we
        # test the ON_OFF_SWITCH property of the resource.
        #
        # If the resource is being disabled the ON_OFF_SWITCH will be DISABLED before the STOP
        # method is called. So, conversely if the ON_OFF_SWITCH is ENABLED the resource is not
        # being disabled and instead the resource group is undergoing either a switch to
        # another node or is being evacuated from the node.
        #
        # - If the resource is being disabled we perform a normal shutdown, regardless of the
        # Failover_type setting.

        ON_OFF_SWITCH=$(/usr/cluster/bin/scha_resource_get -O ON_OFF_SWITCH -R "${RESOURCE}" -G "${RESOURCEGROUP}")

        debug_message "stop_domain - ON_OFF_SWITCH=${ON_OFF_SWITCH}"
        debug_message "stop_domain - FAILOVER_TYPE=${FAILOVER_TYPE}"

        if [ "${ON_OFF_SWITCH}" = "DISABLED" ]
        then
           # The status has to be captured here as well, otherwise the
           # function would return whatever rc held before.
           domain_shutdown
           rc=$?
        else
           case "${FAILOVER_TYPE}" in
                normal)         domain_shutdown
                                rc=$?
                                ;;
                migrate*)       # Fall back to a normal shutdown when the
                                # migration could not be performed.
                                if domain_migrate
                                then
                                   rc=0
                                else
                                   domain_shutdown
                                   rc=$?
                                fi
                                ;;
                *)
                                # SCMSGS
                                # @explanation
                                # Invalid Failover_type specified.
                                # @user_action
                                # Delete and reregister the resource with
                                # a valid Failover_type entry.
                                scds_syslog -p daemon.error -t $(syslog_tag) -m \
                                   "Invalid Failover_type=%s" \
                                   "${FAILOVER_TYPE}"
                                rc=1
                                ;;
           esac
        fi

        debug_message "Function: stop_domain - End"
        return ${rc}
}
542 
#
# get_target_host
#
# Determine the node the domain should be migrated to.
#
# Calls check_commandlog(), which sets TARGET_HOST. On success
# PRIVATELINK_TARGET_HOST is set to the cluster interconnect hostname
# of TARGET_HOST.
#
# Returns 0 when TARGET_HOST is a member of the resource group's
# nodelist, 1 when no target host was found, the node is being
# evacuated, or the target host is not in the nodelist.
#
get_target_host()
{
        debug_message "Function: get_target_host - Begin"
        ${SET_DEBUG}

        typeset rc=1
        typeset node

        # Here, we need to determine the target host as the resource group is either being
        # switched or the node, where the resource group is online, is being evacuated.
        #
        # To determine the target host for a resource group switch we rely on the cluster
        # command log file /var/cluster/logs/commandlog to supply the target host. We need to
        # obtain the correct entry from the command log file and match against the following
        #
        #       <date> + ${RESOURCEGROUP} + "START" + "switch"
        #
        # after which we only save the nodename from a clrg or scswitch command.
        #
        # Sample /var/cluster/logs/commandlog output is as follows,
        #
        # 02/07/2008 08:45:13 pelko1 10548 root START - scswitch -z -g "xvm2-rg" -h "pelko2"
        # 02/07/2008 08:45:38 pelko1 10548 root END 0
        # 02/07/2008 09:01:35 pelko1 10874 root START - clrg "switch" -n "pelko1" "xvm2-rg"
        # 02/07/2008 09:01:36 pelko1 10874 root END -20827641
        #
        # If we are unable to match an entry, as perhaps the entry was logged at <date>
        # and we are checking at <date> + 1 second, i.e. we are checking just as the second
        # entry is incrementing to the next second, we perform another check. In fact the
        # last 10 seconds are checked from the commandlog.
        #
        # Once we have matched an entry from /var/cluster/logs/commandlog, we verify that
        # the target host is a valid nodelist entry for the resource group.
        #
        # - If we have a valid nodelist entry we then determine that target host's cluster
        # interconnect hostname to perform the migration or live migration.
        #
        # - If we are unable to find a match for a switch, we need to consider that an evacuate
        # node is being performed. However, if the node is being evacuated we will rely on
        # RGM to determine the nodename regardless if a migration or live migration was
        # requested. Subsequently, we perform a normal failover. This ensures that we do not
        # migrate or live migrate the domain to a node that may be different to the node
        # selected by RGM.
        #
        # So, suffice to say that if a "switch" match is not found, following the discovery
        # that the resource is not just being disabled, and that a migrate or live migrate
        # was defined, we will always perform a normal failover.
        #
        # Note that the target host match is performed within check_commandlog().

        check_commandlog

        debug_message "get_target_host - ${TARGET_HOST} size=${#TARGET_HOST}"

        if [ "${#TARGET_HOST}" -eq 0 ]
        then
           # SCMSGS
           # @explanation
           # A target host was not found
           # @user_action
           # None required. The domain will not be migrated or live
           # migrated instead a normal failover will be performed.
           scds_syslog -p daemon.notice -t $(syslog_tag) -m \
                "Target host not found, normal failover will be performed"

        elif [ "${TARGET_HOST}" = "$(/usr/bin/uname -n)" ] || \
             echo "${TARGET_HOST}" | /usr/bin/grep '[0-9]:global' > /dev/null 2>&1
        then
           # SCMSGS
           # @explanation
           # The node is being evacuated.
           # @user_action
           # None required. The domain will not be migrated or live
           # migrated instead a normal failover will be performed.
           scds_syslog -p daemon.notice -t $(syslog_tag) -m \
                "Node is being evacuated, normal failover will be performed"

        else
           # The target host is accepted only if it is listed in the
           # resource group's nodelist. The local node has already been
           # ruled out above, so a plain equality test is sufficient.
           for node in $(/usr/cluster/bin/scha_resourcegroup_get -O NODELIST -G "${RESOURCEGROUP}")
           do
                if [ "${node}" = "${TARGET_HOST}" ]
                then
                   rc=0
                   break
                fi
           done

           if [ "${rc}" -eq 0 ]
           then
                PRIVATELINK_TARGET_HOST=$(/usr/cluster/bin/scha_cluster_get -O PRIVATELINK_HOSTNAME_NODE "${TARGET_HOST}")
                debug_message "get_target_host - PRIVATELINK_TARGET_HOST=${PRIVATELINK_TARGET_HOST}"
           else
                # SCMSGS
                # @explanation
                # The target host found in the command log file is not
                # a valid entry within the resource groups nodelist.
                # @user_action
                # None required. The domain will not be migrated or live
                # migrated instead a normal failover will be performed.
                scds_syslog -p daemon.notice -t $(syslog_tag) -m \
                   "Target host %s not matched with the resource group nodelist, normal failover will be performed" \
                   "${TARGET_HOST}"
           fi
        fi

        debug_message "Function: get_target_host - End"
        return ${rc}
}
645 
#
# check_commandlog
#
# Search /var/cluster/logs/commandlog for a START entry logged within
# the last 10 seconds and set the global TARGET_HOST from it. For each
# second, starting with the current one and going back in time, the log
# is searched for
#
#   1. a "switch" entry naming "${RESOURCEGROUP}" (clrg switch or
#      scswitch); the word following -h or -n is used,
#   2. an "evacuate" entry; the word following -n is used,
#   3. a "scswitch" entry with -S; the word following -h is used.
#
# The search stops at the first match. TARGET_HOST is left empty when
# nothing matched.
#
check_commandlog()
{
        debug_message "Function: check_commandlog - Begin"

        # Get the current epoch time
        typeset ETIME=$(/usr/bin/perl -e 'print time;')
        typeset STAMP
        typeset tries=10

        while (( tries > 0 ))
        do
           # Convert the epoch time into the "mm/dd/yyyy hh:mm:ss" form
           # used by the commandlog. Date and time are both derived from
           # the same epoch value, so the date stays correct when the
           # search steps back across midnight.
           STAMP=$(/usr/bin/perl -MPOSIX -e \
                'print strftime("%m/%d/%Y %H:%M:%S", localtime($ARGV[0]));' "${ETIME}")

           debug_message "check_commandlog - performed for ${STAMP}"

           # Check for a clrg switch or scswitch
           TARGET_HOST=$(/usr/bin/grep "${STAMP}" /var/cluster/logs/commandlog |\
                /usr/bin/grep -w START | /usr/bin/grep switch | /usr/bin/grep \"${RESOURCEGROUP}\" |\
                /usr/bin/sed -e 's/^.*-h //' -e 's/^.*-n //' | /usr/bin/awk '{print $1}' | /usr/xpg4/bin/tr -d '" ')

           [ "${#TARGET_HOST}" -ne 0 ] && break

           # Check for a clrg evacuate
           TARGET_HOST=$(/usr/bin/grep "${STAMP}" /var/cluster/logs/commandlog |\
                /usr/bin/grep -w START | /usr/bin/grep evacuate |\
                /usr/bin/sed -e 's/^.*-n //' | /usr/bin/awk '{print $1}' | /usr/xpg4/bin/tr -d '+" ' )

           [ "${#TARGET_HOST}" -ne 0 ] && break

           # Check for a scswitch -S
           TARGET_HOST=$(/usr/bin/grep "${STAMP}" /var/cluster/logs/commandlog |\
                /usr/bin/grep -w START | /usr/bin/grep scswitch | /usr/bin/grep "\-S" |\
                /usr/bin/sed -e 's/^.*-h //' | /usr/bin/awk '{print $1}' | /usr/xpg4/bin/tr -d '\-SK" ' )

           [ "${#TARGET_HOST}" -ne 0 ] && break

           # Nothing logged for this second, try the one before it.
           tries=$((tries - 1))
           ETIME=$((ETIME - 1))
        done

        debug_message "check_commandlog - TARGET_HOST=${TARGET_HOST}"

        debug_message "Function: check_commandlog - End"
}
710 
#
# domain_migrate - migrate or live migrate ${DOMAIN} to another host.
#
# The target host is obtained by calling get_target_host. The xm
# sub-command is taken from ${FAILOVER_TYPE} ("migrate" or
# "migrate --live") and is run under hatimerun, bounded by
# ${MAX_MIGRATE_TIMEOUT}.
#
# After a successful migration the ${ADMIN}/.noop_${RESOURCE} flag file
# is created and domain_delete is called for the domain on this node.
#
# Returns ${rc}: 0 when xm exited 0, otherwise 1 (no target host,
# hatimerun exit status 99 which is logged as a timeout, or any other
# non-zero status).
#
domain_migrate()
{
        debug_message "Function: domain_migrate - Begin"
        ${SET_DEBUG}

        typeset rc

        # Wording inserted into the syslog messages below. MSG is left
        # untouched for any other FAILOVER_TYPE value.
        case "${FAILOVER_TYPE}" in
           "migrate")        MSG="migrated";;
           "migrate --live") MSG="live migrated";;
        esac

        # Without a target host there is nothing to migrate to.
        get_target_host || {
           rc=1
           debug_message "Function: domain_migrate - End"
           return ${rc}
        }

        # SCMSGS
        # @explanation
        # The domain is being migrated or live migrated to the target host.
        # @user_action
        # None required. 
        scds_syslog -p daemon.notice -t $(syslog_tag) -m \
             "Domain %s is being %s to %s" \
             "${DOMAIN}" "${MSG}" "${TARGET_HOST}"

        debug_message "domain_migrate - Running /usr/sbin/xm ${FAILOVER_TYPE} ${DOMAIN} ${PRIVATELINK_TARGET_HOST}"

        # FAILOVER_TYPE is deliberately left unquoted so that
        # "migrate --live" is passed to xm as two separate words.
        /usr/cluster/bin/hatimerun -t ${MAX_MIGRATE_TIMEOUT} -k KILL \
             /usr/sbin/xm ${FAILOVER_TYPE} "${DOMAIN}" ${PRIVATELINK_TARGET_HOST} > /dev/null 2>&1
        rc=$?

        case "${rc}" in
           0)
                # SCMSGS
                # @explanation
                # The domain was migrated or live migrated to the target host.
                # @user_action
                # None required. The domain successfully migrated or live migrated 
                # from the source node to the target node.
                scds_syslog -p daemon.notice -t $(syslog_tag) -m \
                   "Domain %s successfully %s to %s" \
                   "${DOMAIN}" "${MSG}" "${TARGET_HOST}"

                # The domain now runs on the target node, so the stop and the
                # following start both have to be reported as successful
                # without doing any work. The flag file requests that NO-OP
                # behaviour.

                touch ${ADMIN}/.noop_${RESOURCE}
                debug_message "domain_migrate - ${ADMIN}/.noop_${RESOURCE} created"

                # SCMSGS
                # @explanation
                # The domain was migrated or live migrated.
                # @user_action
                # None required. Informational message.
                scds_syslog -p daemon.notice -t $(syslog_tag) -m \
                   "NO-OP STOP being performed" 

                # The migration succeeded, so remove the domain definition
                # from this node.
                #
                # This keeps the domain defined, and therefore startable, on
                # only one cluster node at a time. Domains may sit on storage
                # shared between cluster nodes, and starting the same domain by
                # hand on two nodes could corrupt that storage.
                #
                # SUNW.HAStoragePlus gives some protection already; this is an
                # additional guard against administrative mistakes.
                #
                # Unless the domain arrived by migration or live migration, it
                # is defined again before startup from the XML file previously
                # dumped to the administrative file system.

                domain_delete
                ;;
           99)
                # SCMSGS
                # @explanation
                # The domain migration or live migration timed out.
                # @user_action
                # None required. Informational message.
                scds_syslog -p daemon.notice -t $(syslog_tag) -m \
                   "Migration of domain %s timed out, the domain state is now shut off" \
                   "${DOMAIN}" 

                rc=1
                ;;
           *)
                # SCMSGS
                # @explanation
                # The domain failed to migrate or live migrate to the target host.
                # @user_action
                # None required. The domain failed to migrate or live migrate 
                # from the source node to the target node. A normal failover
                # will be performed.
                scds_syslog -p daemon.notice -t $(syslog_tag) -m \
                   "Domain %s failed to %s to %s, normal failover will be performed" \
                   "${DOMAIN}" "${MSG}" "${TARGET_HOST}" 

                rc=1
                ;;
        esac

        debug_message "Function: domain_migrate - End"
        return ${rc}
}
814         
#
# domain_shutdown - gracefully shut down ${DOMAIN}, forcing it if needed.
#
# Issues "virsh shutdown" and then polls "virsh domstate" every 5 seconds
# while ${SECONDS} is below ${MAX_STOP_TIMEOUT}. If the domain is still
# active afterwards, or the shutdown request itself fails, domain_destroy
# is called instead. When the resulting status is 0 the domain is removed
# from this node with domain_delete.
#
# Returns ${rc}: 0 after a graceful shutdown, otherwise the status of
# domain_destroy.
#
domain_shutdown()
{
        debug_message "Function: domain_shutdown - Begin"
        ${SET_DEBUG}

        typeset rc

        # Domain states which mean the domain has not finished shutting down.
        typeset active_states="running|blocked|paused|in shutdown"

        # Coordinate with the domain OS to perform a graceful shutdown.
        # Note that the virsh shutdown command returns before the domain
        # has shutdown, as such we do not use hatimerun.

        if /usr/bin/virsh shutdown "${DOMAIN}" > /dev/null 2>&1
        then

           # Loop to test if the domain shuts down gracefully
           # or if the shutdown time is exceeded.
           #
           # SECONDS is the shell's own elapsed-time counter and is not
           # reset here, so time already spent before this point counts
           # against MAX_STOP_TIMEOUT.

           while [ "${SECONDS}" -lt "${MAX_STOP_TIMEOUT}" ]
           do
                if /usr/bin/virsh domstate "${DOMAIN}" | /usr/xpg4/bin/grep -q -E "${active_states}"
                then
                   sleep 5
                else
                   # The domain is no longer active. Pushing SECONDS up to
                   # the limit makes the loop condition false and ends the wait.
                   SECONDS=${MAX_STOP_TIMEOUT}
                fi
           done

           # Re-check the state to tell a completed shutdown from a timeout.
           if /usr/bin/virsh domstate "${DOMAIN}" | /usr/xpg4/bin/grep -q -E "${active_states}"
           then
                # SCMSGS
                # @explanation
                # The domain failed to shutdown gracefully.
                # @user_action
                # None required. The domain failed to shutdown 
                # gracefully and will now be immediately terminated.
                scds_syslog -p daemon.notice -t $(syslog_tag) -m \
                   "Domain %s failed to shutdown gracefully, immediate shutdown will now be performed" \
                   "${DOMAIN}"

                domain_destroy
                rc=$?
           else
                # SCMSGS
                # @explanation
                # The domain was shutdown gracefully.
                # @user_action
                # None required. The domain has shutdown gracefully.
                scds_syslog -p daemon.info -t $(syslog_tag) -m \
                   "Domain %s has been gracefully shutdown" \
                   "${DOMAIN}"
                rc=0
           fi

        else
           # SCMSGS
           # @explanation
           # The /usr/bin/virsh shutdown command failed.
           # @user_action
           # None required. The domain will now be terminated immediately.
           scds_syslog -p daemon.notice -t $(syslog_tag) -m \
                "/usr/bin/virsh shutdown %s failed, immediate shutdown will now be performed" \
                "${DOMAIN}"

           domain_destroy
           rc=$?
        fi

        # If the domain has successfully shutdown, we will now delete the domain.
        #
        # Doing this ensures that the domain is only defined and able to be started
        # on one cluster node at a time. Domains can use shared storage between cluster
        # nodes so it is very important that we prevent any data corruption if a domain
        # gets manually started on multiple cluster nodes where shared storage is used.
        #
        # Of course using SUNW.HAStoragePlus somewhat protects against this, however we
        # simply want to avoid any manual administrative errors performed by mistake.
        #
        # Note, unless the domain was migrated or live migrated, the domain is defined
        # before startup using a previously dumped XML file for the administrative file
        # system.

        [ "${rc}" -eq 0 ] && domain_delete

        debug_message "Function: domain_shutdown - End"
        return ${rc}
}
901 
#
# domain_destroy - immediately terminate ${DOMAIN} with "virsh destroy".
#
# The outcome is reported through scds_syslog: daemon.notice when the
# command succeeds, daemon.error when it fails.
#
# Returns 0 if virsh destroy succeeded, 1 otherwise.
#
domain_destroy()
{
        debug_message "Function: domain_destroy - Begin"
        ${SET_DEBUG}

        typeset rc

        # DOMAIN is quoted so that the name always reaches virsh as a
        # single, unexpanded argument.
        if /usr/bin/virsh destroy "${DOMAIN}" > /dev/null 2>&1
        then
           # SCMSGS
           # @explanation
           # The domain was immediately terminated.
           # @user_action
           # None required. The domain had previously failed to shutdown
           # gracefully but has now been immediately terminated.
           scds_syslog -p daemon.notice -t $(syslog_tag) -m \
                "Domain %s has been immediately terminated" \
                "${DOMAIN}"
           rc=0
        else
           # SCMSGS
           # @explanation
           # The /usr/bin/virsh destroy command failed.
           # @user_action
           # Determine why it was not possible to immediately terminate
           # the domain. 
           scds_syslog -p daemon.error -t $(syslog_tag) -m \
                "Domain %s failed to shutdown immediately" \
                "${DOMAIN}"
           rc=1
        fi

        debug_message "Function: domain_destroy - End"
        return ${rc}
}
937 
#
# domain_delete - remove the definition of ${DOMAIN} from this node
# using "xm delete".
#
# The outcome is reported through scds_syslog: daemon.notice when the
# command succeeds, daemon.error when it fails.
#
# Returns 0 if xm delete succeeded, 1 otherwise.
#
domain_delete()
{
        debug_message "Function: domain_delete - Begin"
        ${SET_DEBUG}

        # The purpose of deleting the domain after shutdown is to avoid the possibility of
        # someone manually starting the domain on a different node. Doing so would compromise
        # the domain if shared storage was used for the domain. The domain's configuration
        # is always dumped to the agent's administrative file system so that the domain can
        # be defined before startup.

        typeset rc

        # DOMAIN is quoted so that the name always reaches xm as a
        # single, unexpanded argument.
        if /usr/sbin/xm delete "${DOMAIN}" > /dev/null 2>&1
        then
           # SCMSGS
           # @explanation
           # The domain was deleted.
           # @user_action
           # None required. The domain has been deleted as it 
           # will be defined on another node. Deleting the domain
           # on this node ensures that it can't be started on
           # more than one cluster node at a time. 
           scds_syslog -p daemon.notice -t $(syslog_tag) -m \
                "Domain %s has been deleted on this node" \
                "${DOMAIN}"
           rc=0
        else
           # SCMSGS
           # @explanation
           # The /usr/sbin/xm delete command failed.
           # @user_action
           # Determine why it was not possible to delete the domain.
           scds_syslog -p daemon.error -t $(syslog_tag) -m \
                "Failed to delete domain %s on this node" \
                "${DOMAIN}"
           rc=1
        fi

        debug_message "Function: domain_delete - End"

        # Hand the result back to the caller; without this the function's
        # status would be that of the debug_message call above.
        return ${rc}
}