New functions.ksh
1 #
2 # CDDL HEADER START
3 #
4 # The contents of this file are subject to the terms of the
5 # Common Development and Distribution License (the License).
6 # You may not use this file except in compliance with the License.
7 #
8 # You can obtain a copy of the license at usr/src/CDDL.txt
9 # or http://www.opensolaris.org/os/licensing.
10 # See the License for the specific language governing permissions
11 # and limitations under the License.
12 #
13 # When distributing Covered Code, include this CDDL HEADER in each
14 # file and include the License file at usr/src/CDDL.txt.
15 # If applicable, add the following below this CDDL HEADER, with the
16 # fields enclosed by brackets [] replaced with your own identifying
17 # information: Portions Copyright [yyyy] [name of copyright owner]
18 #
19 # CDDL HEADER END
20 #
21 # Copyright 2008 Sun Microsystems, Inc. All rights reserved.
22 # Use is subject to license terms.
23 #
24 #ident "@(#)functions.ksh 1.1 08/11/19 SMI"
25 #
26
# Package name reported in every syslog tag built by syslog_tag().
PKG="SUNWscxvm"

# Logging programs used by scds_syslog(): the Solaris Cluster logger and
# the plain logger(1) that is used when the cluster logger is not present.
SCLOGGER="/usr/cluster/lib/sc/scds_syslog"
LOGGER="/usr/bin/logger"

# Initialised empty here; not referenced by any function in this file.
TASK_COMMAND=
RESOURCE_PROJECT_NAME=
32
#
# syslog_tag - print the tag used for every message this agent logs.
#
# The tag has the form SC[<package>.<method>]:<resource group>:<resource>.
# Any of PKG, METHOD, RESOURCEGROUP or RESOURCE that is unset or empty is
# shown as "??".
#
syslog_tag()
{
	# Honour the tracing switch maintained by debug_message().
	${SET_DEBUG}

	typeset pkg="${PKG:-??}"
	typeset method="${METHOD:-??}"
	typeset group="${RESOURCEGROUP:-??}"
	typeset resource="${RESOURCE:-??}"

	print "SC[${pkg}.${method}]:${group}:${resource}"
}
38
#
# scds_syslog - log a message, preferably through the cluster logger.
#
# Usage: scds_syslog -p <priority> -t <tag> -m <format> [<argument> ...]
#
# When ${SCLOGGER} exists the complete argument list is handed to it and
# it is run in the background. Otherwise the options are parsed here, the
# message is formatted with printf(1) and passed to ${LOGGER}.
#
# The fallback path sets the global variables PRI, TAG and LOG_STRING.
# Returns the exit status of the logger on the fallback path.
#
scds_syslog()
{
	typeset opt saved_optind

	if [ -f "${SCLOGGER}" ]
	then
		${SCLOGGER} "$@" &
	else
		# Forget whatever an earlier call left behind, and restart
		# option parsing at the first argument. Without this a shell
		# that shares OPTIND with the caller skips the options on
		# every call but the first and reuses the stale PRI and TAG.
		PRI=
		TAG=
		saved_optind=${OPTIND}
		OPTIND=1

		while getopts 'p:t:m' opt
		do
			case "${opt}" in
			t)	TAG=${OPTARG};;
			p)	PRI=${OPTARG};;
			esac
		done

		shift $((OPTIND - 1))
		OPTIND=${saved_optind}

		# What remains is the printf format followed by its arguments.
		LOG_STRING=$(/usr/bin/printf "$@")

		# Build the logger command line, leaving out any option that
		# was not supplied. The message is passed as a single word so
		# that it is neither split nor glob-expanded.
		set -- "${LOG_STRING}"
		[ -n "${TAG}" ] && set -- -t "${TAG}" "$@"
		[ -n "${PRI}" ] && set -- -p "${PRI}" "$@"

		${LOGGER} "$@"
	fi
}
58
#
# debug_message - log a debug message and maintain the tracing switch.
#
# $1 is the text to log. Debugging is active when DEBUG equals the
# resource name or the word ALL. While active, SET_DEBUG holds "set -x"
# and the text is logged at daemon.debug; otherwise SET_DEBUG is emptied
# and nothing is logged.
#
debug_message()
{
	if [[ "${DEBUG}" != "${RESOURCE}" && "${DEBUG}" != "ALL" ]]
	then
		# Debugging is off for this resource.
		SET_DEBUG=
		return 0
	fi

	SET_DEBUG="set -x"
	DEBUG_TEXT=${1}

	scds_syslog -p daemon.debug -t $(syslog_tag) -m \
	    "%s" "${DEBUG_TEXT}"
}
72
#
# validate - check the prerequisites for running the agent on this node.
#
# Checks, in order, that the platform reported by uname -i is i86xpv,
# that FAILOVER_TYPE holds one of the supported values, and that the
# directory named by ADMIN exists. The first failing check is logged at
# daemon.error.
#
# Returns 0 when every check passes, 1 otherwise.
#
validate()
{
	debug_message "Function: validate - Begin"
	${SET_DEBUG}

	case "$(/usr/bin/uname -i)" in
	i86xpv)
		;;
	*)
		# SCMSGS
		# @explanation
		# Solaris is not booted with xVM.
		# @user_action
		# Ensure that the default boot grub menu is set to boot
		# Solaris xVM.
		scds_syslog -p daemon.error -t $(syslog_tag) -m \
		    "Node is not booted with xVM"

		return 1
		;;
	esac

	case "${FAILOVER_TYPE}" in
	"normal"|"migrate"|"migrate --live")
		;;
	*)
		# SCMSGS
		# @explanation
		# Incorrect Failover_type specified.
		# @user_action
		# Ensure that Failover_type="normal"|"migrate"|"migrate --live"
		# is specified within /opt/SUNWscxvm/util/xvm_config.
		scds_syslog -p daemon.error -t $(syslog_tag) -m \
		    "Failover_type %s is not valid" \
		    "${FAILOVER_TYPE}"

		return 1
		;;
	esac

	if [ -d "${ADMIN}" ]
	then
		debug_message "Function: validate - End"
		return 0
	fi

	# SCMSGS
	# @explanation
	# The agent administrative pathname does not exist.
	# @user_action
	# Ensure that the agent administrative pathname is specified
	# within /opt/SUNWscxvm/util/xvm_config.
	scds_syslog -p daemon.error -t $(syslog_tag) -m \
	    "Administrative pathname not found"

	return 1
}
124
#
# start_domain - define the domain if necessary and start it.
#
# Reads RESOURCE, RESOURCEGROUP, DOMAIN and ADMIN. After a successful
# start the domain configuration is dumped to ${ADMIN}/${RESOURCE}.xml.
#
# Returns 0 when the domain is running (started here, manually started,
# or left running after a migration), 1 when it cannot be defined or
# started. A failed XML dump is logged but does not change the result.
#
start_domain()
{
	debug_message "Function: start_domain - Begin"
	${SET_DEBUG}

	typeset rc=0
	typeset xml_file="${ADMIN}/${RESOURCE}.xml"
	typeset xml_tmp="${xml_file}.$$"

	# Turn off PMF restart. Starting a domain does not leave
	# a running pid as in a classic Solaris Cluster agent.
	# NOTE(review): the background sleep presumably exists so that a
	# process remains under the PMF tag for the start period - confirm.

	START_TIMEOUT=$(/usr/cluster/bin/scha_resource_get -O START_TIMEOUT \
	    -R ${RESOURCE} -G ${RESOURCEGROUP} )

	sleep ${START_TIMEOUT} &
	/usr/cluster/bin/pmfadm -s ${RESOURCEGROUP},${RESOURCE},0.svc

	# Check if the domain exists.
	#
	# If the domain does not exist, we may be starting the domain
	# on a new cluster node following a failover. As such we will
	# define the domain using the previously dumped XML file
	# located within the agent's administrative file system.
	#
	# If the domain does exist either the domain was manually
	# started or the domain was migrated or live migrated from
	# another cluster node. As such we will use the already
	# defined domain.
	#
	# Note that when the domain is successfully stopped the domain
	# is deleted. We do this simply to avoid the domain from
	# being manually started on multiple cluster nodes. See
	# domain_delete() for more information.

	if /usr/bin/virsh dominfo ${DOMAIN} > /dev/null 2>&1
	then
		debug_message "Validate - domain ${DOMAIN} exists"
	else
		if [ -f "${xml_file}" ]
		then
			if /usr/bin/virsh define "${xml_file}" > /dev/null 2>&1
			then
				# SCMSGS
				# @explanation
				# The domain is being defined using a XML file.
				# @user_action
				# None, the domain is being defined using a previously defined
				# XML file when the domain was last successfully started.
				scds_syslog -p daemon.notice -t $(syslog_tag) -m \
				    "Domain %s defined using %s/%s.xml" \
				    "${DOMAIN}" "${ADMIN}" "${RESOURCE}"
			else
				# SCMSGS
				# @explanation
				# The domain failed to be defined using a XML file.
				# @user_action
				# The command /usr/bin/virsh define failed to define the domain.
				# Determine if you have specified the correct domain name when
				# registering the resource.
				scds_syslog -p daemon.error -t $(syslog_tag) -m \
				    "Failed to define %s using %s/%s.xml" \
				    "${DOMAIN}" "${ADMIN}" "${RESOURCE}"

				return 1
			fi
		else
			# SCMSGS
			# @explanation
			# The domain does not exist.
			# @user_action
			# You must ensure that the domain exists. The preferred
			# method for creating a domain is to use virt-install.
			# Refer to the virt-install(1M) man page for further
			# information.
			scds_syslog -p daemon.error -t $(syslog_tag) -m \
			    "Domain %s does not exist" \
			    "${DOMAIN}"

			return 1
		fi
	fi

	# Tolerate a manually started domain and a NO-OP start
	# otherwise start the domain.

	if [ -f ${ADMIN}/.noop_${RESOURCE} ]
	then
		# SCMSGS
		# @explanation
		# The domain was migrated or live migrated.
		# @user_action
		# None required. Informational message.
		scds_syslog -p daemon.notice -t $(syslog_tag) -m \
		    "NO-OP START being performed"

		/usr/bin/rm ${ADMIN}/.noop_${RESOURCE}
		debug_message "start_domain - ${ADMIN}/.noop_${RESOURCE} deleted"

	elif echo $(/usr/bin/virsh domstate ${DOMAIN}) | /usr/xpg4/bin/grep -q -E "running|blocked|paused|in shutdown" > /dev/null 2>&1
	then
		# SCMSGS
		# @explanation
		# The domain was manually started.
		# @user_action
		# None required. Informational message.
		scds_syslog -p daemon.notice -t $(syslog_tag) -m \
		    "Domain %s was manually started" \
		    "${DOMAIN}"
	else
		if /usr/bin/virsh start ${DOMAIN} > /dev/null 2>&1
		then
			# SCMSGS
			# @explanation
			# The domain was started successfully.
			# @user_action
			# None required. Informational message.
			scds_syslog -p daemon.notice -t $(syslog_tag) -m \
			    "Domain %s started" \
			    "${DOMAIN}"
		else
			# SCMSGS
			# @explanation
			# The domain failed to start.
			# @user_action
			# Check the syslog for further messages. If possible
			# the cluster will attempt to restart the domain.
			scds_syslog -p daemon.error -t $(syslog_tag) -m \
			    "Domain %s failed to start" \
			    "${DOMAIN}"

			rc=1
		fi
	fi

	if [ ${rc} -eq 0 ]
	then
		# Dump the domain configuration into an XML file. This file is then
		# used on another cluster node to define the domain but only if the
		# domain does not exist.
		#
		# The dump is written to a temporary file and renamed only when
		# virsh succeeds. Redirecting straight into the XML file would
		# truncate the last good copy before the outcome is known.

		if /usr/bin/virsh dumpxml ${DOMAIN} > "${xml_tmp}" &&
		    /usr/bin/mv -f "${xml_tmp}" "${xml_file}"
		then
			debug_message "start_domain - ${ADMIN}/${RESOURCE}.xml created"
		else
			/usr/bin/rm -f "${xml_tmp}"

			# SCMSGS
			# @explanation
			# /usr/bin/virsh dumpxml for domain failed.
			# @user_action
			# Determine why the /usr/bin/virsh dumpxml command failed.
			scds_syslog -p daemon.error -t $(syslog_tag) -m \
			    "/usr/bin/virsh dumpxml %s > %s/%s.xml failed" \
			    "${DOMAIN}" "${ADMIN}" "${RESOURCE}"
		fi
	fi

	debug_message "Function: start_domain - End"
	return ${rc}
}
282
#
# check_domain - probe the domain.
#
# Reads RESOURCE, RESOURCEGROUP, DOMAIN and the optional PLUGIN_PROBE
# command line. When the domain is in an acceptable run state and a
# plugin probe is configured, the probe is run under hatimerun with 90%
# of Probe_timeout, which is also passed to the probe as its last
# argument.
#
# Returns
#	0	the domain is healthy, or the plugin probe is not executable
#	100	a restart is wanted, or a start or stop is still in progress
#	201	the plugin probe requested an immediate failover
#
check_domain()
{
	debug_message "Function: check_domain - Begin"
	${SET_DEBUG}

	typeset rc

	if /usr/bin/pgrep -f "control_xvm start -R ${RESOURCE} " >/dev/null 2>&1
	then
		debug_message "Function: check_domain - start program is still running "
		rc=100
	else
		domstate=$(/usr/bin/virsh domstate ${DOMAIN})

		case "${domstate}" in

		# Acceptable run states

		"running"|"blocked"|"paused"|"in shutdown")

			if [ "${#PLUGIN_PROBE}" -ne 0 ]
			then
				# Only the first word of PLUGIN_PROBE is the program,
				# the remainder are its arguments.
				if [ -x "$(echo ${PLUGIN_PROBE} | /usr/bin/awk '{print $1}')" ]
				then
					PROBE_TIMEOUT=$(/usr/cluster/bin/scha_resource_get -O Extension \
					    -R ${RESOURCE} -G ${RESOURCEGROUP} Probe_timeout)

					# The value is taken from the second word of the output.
					PROBE_TIMEOUT=$(/usr/bin/echo ${PROBE_TIMEOUT} | /usr/bin/awk '{print $2}')

					# Run the supplied probe with only 90% of PROBE_TIMEOUT. Also note that this
					# is supplied as a parameter to the PLUGIN_PROBE.

					HATIMERUN_TIMEOUT=$(/usr/bin/expr ${PROBE_TIMEOUT} \* 90 \/ 100)

					output=$(/usr/cluster/bin/hatimerun -t ${HATIMERUN_TIMEOUT} -k 9 ${PLUGIN_PROBE} ${HATIMERUN_TIMEOUT})
					rc=$?

					case ${rc} in
					0)	debug_message "check_domain - ${DOMAIN} ${output}"
						rc=0
						;;
					99)
						# SCMSGS
						# @explanation
						# The domain probe timed out.
						# @user_action
						# Ensure that ${PLUGIN_PROBE} can complete within
						# 90% of PROBE_TIMEOUT.
						scds_syslog -p daemon.error -t $(syslog_tag) -m \
						    "%s did not complete within %s seconds" \
						    "${PLUGIN_PROBE}" "${HATIMERUN_TIMEOUT}"

						rc=100
						;;
					100)	if /usr/bin/pgrep -f "gds_svc_start .*-R ${RESOURCE} " >/dev/null 2>&1
						then
							debug_message "check_domain - ${DOMAIN} is still starting"
							rc=100
						elif /usr/bin/pgrep -f "gds_svc_stop .*-R ${RESOURCE} " >/dev/null 2>&1
						then
							debug_message "check_domain - ${DOMAIN} is stopping"
							rc=100
						else
							# SCMSGS
							# @explanation
							# The domain probe has requested a domain restart.
							# @user_action
							# None. A domain restart will be attempted.
							#
							# The first conversion must be %s so that the probe
							# name is substituted; a bare "%" followed by text
							# is not a valid printf conversion.
							scds_syslog -p daemon.error -t $(syslog_tag) -m \
							    "%s has requested a domain restart %s" \
							    "${PLUGIN_PROBE}" "${output}"

							rc=100
						fi
						;;
					201)	if /usr/bin/pgrep -f "gds_svc_start .*-R ${RESOURCE} " >/dev/null 2>&1
						then
							debug_message "check_domain - ${DOMAIN} is still starting"
							rc=100
						elif /usr/bin/pgrep -f "gds_svc_stop .*-R ${RESOURCE} " >/dev/null 2>&1
						then
							debug_message "check_domain - ${DOMAIN} is stopping"
							rc=100
						else
							# SCMSGS
							# @explanation
							# The domain has requested an immediate failover.
							# @user_action
							# None. The domain will be immediately failed over.
							scds_syslog -p daemon.error -t $(syslog_tag) -m \
							    "%s has requested an immediate failover" \
							    "${PLUGIN_PROBE}"

							rc=201
						fi
						;;
					*)
						# SCMSGS
						# @explanation
						# ${PLUGIN_PROBE} did not return 0, 100 or 201.
						# @user_action
						# None. A domain restart will be attempted.
						scds_syslog -p daemon.error -t $(syslog_tag) -m \
						    "%s did not return 0, 100 or 201, a domain restart will be attempted" \
						    "${PLUGIN_PROBE}"
						rc=100
						;;
					esac
				else
					# SCMSGS
					# @explanation
					# ${PLUGIN_PROBE} does not exist or is not executable.
					# @user_action
					# Check the pathname exists and that ${PLUGIN_PROBE} is executable.
					scds_syslog -p daemon.error -t $(syslog_tag) -m \
					    "%s non-existent executable" \
					    "${PLUGIN_PROBE}"

					rc=0
				fi
			else
				rc=0
			fi

			;;

		# Restartable run states

		"shut off"|"crashed")

			rc=100
			;;

		# Unknown run states

		*)
			rc=100
			;;
		esac

		debug_message "check_domain - ${DOMAIN} ${domstate}"

	fi

	debug_message "Function: check_domain - End"
	return ${rc}
}
430
#
# stop_domain - stop the domain, by migration when that is requested and
# possible, otherwise by shutting it down.
#
# Reads RESOURCE, RESOURCEGROUP and FAILOVER_TYPE. Sets STOP_TIMEOUT,
# MAX_MIGRATE_TIMEOUT, MAX_STOP_TIMEOUT and ON_OFF_SWITCH, and resets
# SECONDS, which domain_shutdown() uses as its clock.
#
# Returns 0 when the domain was migrated or shut down, otherwise the
# status of domain_shutdown(), or 1 for an invalid FAILOVER_TYPE.
#
stop_domain()
{
	debug_message "Function: stop_domain - Begin"
	${SET_DEBUG}

	typeset rc=0

	STOP_TIMEOUT=$(/usr/cluster/bin/scha_resource_get -O STOP_TIMEOUT \
	    -R ${RESOURCE} -G ${RESOURCEGROUP} )

	# Note that GDS will attempt to cleanup after 80% of STOP_TIMEOUT
	# has been consumed. In this regard, we only allocate a combined
	# 75% of STOP_TIMEOUT to MAX_MIGRATE_TIMEOUT and MAX_STOP_TIMEOUT.
	#
	# This leaves 5% for domain_destroy() which may be called if
	# domain_shutdown() exceeds its timeout and finally domain_delete().

	MAX_MIGRATE_TIMEOUT=$(/usr/bin/expr ${STOP_TIMEOUT} \* 25 \/ 100)
	MAX_STOP_TIMEOUT=$(/usr/bin/expr ${STOP_TIMEOUT} \* 50 \/ 100)
	SECONDS=0

	# At resource creation, the administrator can determine the Failover_type.
	# Valid values for Failover_type are
	#
	# Failover_type="normal"
	#	o Stop the resource (shutdown the domain)
	#	o Failover the resource group from the source node to the target node
	#	o Start the resource (start the domain)
	#
	# Failover_type="migrate"
	#	o Suspend the domain on the source node
	#	o Copy the domain's memory pages from the source node to the target node
	#	o Resume the domain on the target node
	#
	# Failover_type="migrate --live"
	#	o Iteratively copy the domain's memory pages from the source node to the target node
	#	o When pre-copy is no longer beneficial, suspend the domain on the source node
	#	o Copy the domain's remaining "dirty" pages from the source node to the target node
	#	o Resume the domain on the target node
	#
	# Note that migration or live migration is performed over the cluster interconnect.
	#
	# For migration or live migration to be attempted across Solaris Cluster xVM nodes
	# the following conditions must be met.
	#
	# - The target Solaris Cluster xVM node must be running the same xVM version.
	#
	# - The migration TCP port must be open and accepting connections from the source
	#   Solaris Cluster xVM node.
	#
	# - There must be sufficient resources for the domain to run in.
	#
	# - If the conditions are met and migration or live migration is successful a NO-OP
	#   STOP and START is performed. This will ensure a successful STOP and START to the
	#   appropriate RGM callback methods. Furthermore, doing a NO-OP RGM failover will
	#   ensure that RGM subsequently actions any dependencies and that Solaris Cluster
	#   reflects the correct state and status of resource groups and resources.
	#
	# - If the conditions are met but migration or live migration is not successful a
	#   normal failover will be performed.
	#
	# - If the conditions are not met, migration or live migration will fail and a normal
	#   failover will be performed.
	#
	# However, before attempting a migration or live migration we need to determine if the
	# resource is being disabled. To distinguish if the resource is being disabled we
	# test the ON_OFF_SWITCH property of the resource.
	#
	# If the resource is being disabled the ON_OFF_SWITCH will be DISABLED before the STOP
	# method is called. So, conversely if the ON_OFF_SWITCH is ENABLED the resource is not
	# being disabled and instead the resource group is undergoing either a switch to
	# another node or is being evacuated from the node.
	#
	# - If the resource is being disabled we perform a normal shutdown, regardless of the
	#   Failover_type setting.

	ON_OFF_SWITCH=$(/usr/cluster/bin/scha_resource_get -O ON_OFF_SWITCH -R ${RESOURCE} -G ${RESOURCEGROUP})

	debug_message "stop_domain - ON_OFF_SWITCH=${ON_OFF_SWITCH}"
	debug_message "stop_domain - FAILOVER_TYPE=${FAILOVER_TYPE}"

	if [ "${ON_OFF_SWITCH}" = "DISABLED" ]
	then
		# The result must be captured here as well, otherwise a failed
		# shutdown of a disabled resource would be reported as success.
		domain_shutdown
		rc=$?
	else
		case "${FAILOVER_TYPE}" in
		normal)
			domain_shutdown
			rc=$?
			;;
		migrate*)
			# Fall back to a normal shutdown when migration fails.
			if domain_migrate
			then
				rc=0
			else
				domain_shutdown
				rc=$?
			fi
			;;
		*)
			# SCMSGS
			# @explanation
			# Invalid Failover_type specified.
			# @user_action
			# Delete and reregister the resource with
			# a valid Failover_type entry.
			scds_syslog -p daemon.error -t $(syslog_tag) -m \
			    "Invalid Failover_type=%s" \
			    "${FAILOVER_TYPE}"
			rc=1
			;;
		esac
	fi

	debug_message "Function: stop_domain - End"
	return ${rc}
}
542
#
# get_target_host - determine the node the domain should be migrated to.
#
# Calls check_commandlog() to obtain TARGET_HOST, verifies it against
# the node list of RESOURCEGROUP and, when valid, sets
# PRIVATELINK_TARGET_HOST to that node's cluster interconnect hostname.
#
# Returns 0 when a valid target host was found, 1 when a normal failover
# must be performed instead.
#
get_target_host()
{
	debug_message "Function: get_target_host - Begin"
	${SET_DEBUG}

	typeset rc=1
	typeset node
	typeset this_node

	# Here, we need to determine the target host as the resource group is either being
	# switched or the node, where the resource group is online, is being evacuated.
	#
	# To determine the target host for a resource group switch we rely on the cluster
	# command log file /var/cluster/logs/commandlog to supply the target host. We need to
	# obtain the correct entry from the command log file and match against the following
	#
	#	<date> + ${RESOURCEGROUP} + "START" + "switch"
	#
	# after which we only save the nodename from a clrg or scswitch command.
	#
	# Sample /var/cluster/logs/commandlog output is as follows,
	#
	# 02/07/2008 08:45:13 pelko1 10548 root START - scswitch -z -g "xvm2-rg" -h "pelko2"
	# 02/07/2008 08:45:38 pelko1 10548 root END 0
	# 02/07/2008 09:01:35 pelko1 10874 root START - clrg "switch" -n "pelko1" "xvm2-rg"
	# 02/07/2008 09:01:36 pelko1 10874 root END -20827641
	#
	# If we are unable to match an entry, as perhaps the entry was logged at <date>
	# and we are checking at <date> + 1 second, i.e. we are checking just as the second
	# entry is incrementing to the next second, we perform another check. In fact the
	# last 10 seconds are checked from the commandlog.
	#
	# Once we have matched an entry from /var/cluster/logs/commandlog, we verify that
	# the target host is a valid nodelist entry for the resource group.
	#
	# - If we have a valid nodelist entry we then determine that target host's cluster
	#   interconnect hostname to perform the migration or live migration.
	#
	# - If we are unable to find a match for a switch, we need to consider that an evacuate
	#   node is being performed. However, if the node is being evacuated we will rely on
	#   RGM to determine the nodename regardless if a migration or live migration was
	#   requested. Subsequently, we perform a normal failover. This ensures that we do not
	#   migrate or live migrate the domain to a node that may be different to the node
	#   selected by RGM.
	#
	# So, suffice to say that if a "switch" match is not found, following the discovery
	# that the resource is not just being disabled, and that a migrate or live migrate
	# was defined, we will always perform a normal failover.
	#
	# Note that the target host match is performed within check_commandlog().

	check_commandlog

	debug_message "get_target_host - ${TARGET_HOST} size=${#TARGET_HOST}"

	if [ "${#TARGET_HOST}" -eq 0 ]
	then
		# SCMSGS
		# @explanation
		# A target host was not found
		# @user_action
		# None required. The domain will not be migrated or live
		# migrated instead a normal failover will be performed.
		scds_syslog -p daemon.notice -t $(syslog_tag) -m \
		    "Target host not found, normal failover will be performed"

		debug_message "Function: get_target_host - End"
		return ${rc}
	fi

	this_node=$(/usr/bin/uname -n)

	# The pattern is quoted so that the shell cannot expand it as a file
	# name pattern before grep sees it.
	if [ "${TARGET_HOST}" = "${this_node}" ] ||
	    [ -n "$(echo "${TARGET_HOST}" | /usr/bin/grep '[0-9]:global')" ]
	then
		# SCMSGS
		# @explanation
		# The node is being evacuated.
		# @user_action
		# None required. The domain will not be migrated or live
		# migrated instead a normal failover will be performed.
		scds_syslog -p daemon.notice -t $(syslog_tag) -m \
		    "Node is being evacuated, normal failover will be performed"

	else
		# The target host is valid only when it is a member of the
		# node list other than this node. Both conditions must hold:
		# testing them with "or" would accept any target host as soon
		# as the node list contains a second node.
		for node in $(/usr/cluster/bin/scha_resourcegroup_get -O NODELIST -G ${RESOURCEGROUP})
		do
			if [[ "${node}" != "${this_node}" && "${node}" = "${TARGET_HOST}" ]]
			then
				rc=0
				break
			fi
		done

		if [ "${rc}" -eq 0 ]
		then
			PRIVATELINK_TARGET_HOST=$(/usr/cluster/bin/scha_cluster_get -O PRIVATELINK_HOSTNAME_NODE ${TARGET_HOST})
			debug_message "get_target_host - PRIVATELINK_TARGET_HOST=${PRIVATELINK_TARGET_HOST}"
		else
			# SCMSGS
			# @explanation
			# The target host found in the command log file is not
			# a valid entry within the resource groups nodelist.
			# @user_action
			# None required. The domain will not be migrated or live
			# migrated instead a normal failover will be performed.
			scds_syslog -p daemon.notice -t $(syslog_tag) -m \
			    "Target host %s not matched with the resource group nodelist, normal failover will be performed" \
			    "${TARGET_HOST}"
		fi
	fi

	debug_message "Function: get_target_host - End"
	return ${rc}
}
645
#
# check_commandlog - find the target host of a recent switch or evacuate.
#
# Searches /var/cluster/logs/commandlog for a START entry logged during
# one of the last 10 seconds, newest first, that is a switch of
# RESOURCEGROUP, a clrg evacuate or a scswitch -S. The node name taken
# from the first match is left in TARGET_HOST; TARGET_HOST is empty when
# nothing matched. Also sets HHMMSS.
#
check_commandlog()
{
	debug_message "Function: check_commandlog - Begin"

	# Get the current epoch time
	typeset ETIME=$(/usr/bin/perl -e 'print time;')
	typeset DATE
	typeset STAMP
	typeset i=10

	while (( i > 0 ))
	do
		# Iteratively search the commandlog for a switch or evacuate, going back in time
		# by one second each time. If a match is found we break out of the loop.
		#
		# Convert the epoch time into the "MM/DD/YYYY HH:MM:SS" form used at the
		# start of each commandlog entry. Both the date and the time are derived
		# from the same epoch value, so that stepping back across midnight also
		# steps the date back.
		STAMP=$(/usr/bin/perl -MPOSIX -e \
		    'print strftime("%m/%d/%Y %H:%M:%S", localtime($ARGV[0]));' "${ETIME}")
		DATE=${STAMP% *}
		HHMMSS=${STAMP#* }

		debug_message "check_commadlog - performed for ${DATE} ${HHMMSS}"

		# Check for a clrg switch or scswitch
		TARGET_HOST=$(/usr/bin/grep "${DATE} ${HHMMSS}" /var/cluster/logs/commandlog |\
		    /usr/bin/grep -w START | /usr/bin/grep switch | /usr/bin/grep "\"${RESOURCEGROUP}\"" |\
		    /usr/bin/sed -e 's/^.*-h //' -e 's/^.*-n //' | /usr/bin/awk '{print $1}' | /usr/xpg4/bin/tr -d '" ')

		[ "${#TARGET_HOST}" -ne 0 ] && break

		# Check for a clrg evacuate
		TARGET_HOST=$(/usr/bin/grep "${DATE} ${HHMMSS}" /var/cluster/logs/commandlog |\
		    /usr/bin/grep -w START | /usr/bin/grep evacuate |\
		    /usr/bin/sed -e 's/^.*-n //' | /usr/bin/awk '{print $1}' | /usr/xpg4/bin/tr -d '+" ' )

		[ "${#TARGET_HOST}" -ne 0 ] && break

		# Check for a scswitch -S
		TARGET_HOST=$(/usr/bin/grep "${DATE} ${HHMMSS}" /var/cluster/logs/commandlog |\
		    /usr/bin/grep -w START | /usr/bin/grep scswitch | /usr/bin/grep "\-S" |\
		    /usr/bin/sed -e 's/^.*-h //' | /usr/bin/awk '{print $1}' | /usr/xpg4/bin/tr -d '\-SK" ' )

		[ "${#TARGET_HOST}" -ne 0 ] && break

		i=$((i - 1))
		ETIME=$((ETIME - 1))
	done

	debug_message "check_commandlog - TARGET_HOST=${TARGET_HOST}"

	debug_message "Function: check_commandlog - End"
}
710
#
# domain_migrate - migrate or live migrate the domain to the target host.
#
# Reads FAILOVER_TYPE, DOMAIN, ADMIN, RESOURCE and MAX_MIGRATE_TIMEOUT.
# The target is obtained from get_target_host(). After a successful
# migration the marker file ${ADMIN}/.noop_${RESOURCE} is created and the
# domain is deleted on this node.
#
# Returns 0 when the domain was migrated, 1 in every other case.
#
domain_migrate()
{
	debug_message "Function: domain_migrate - Begin"
	${SET_DEBUG}

	typeset rc

	# Wording used in the messages below.
	case "${FAILOVER_TYPE}" in
	"migrate")		MSG="migrated";;
	"migrate --live")	MSG="live migrated";;
	esac

	if ! get_target_host
	then
		rc=1
	else
		# SCMSGS
		# @explanation
		# The domain is being migrated or live migrated to the target host.
		# @user_action
		# None required.
		scds_syslog -p daemon.notice -t $(syslog_tag) -m \
		    "Domain %s is being %s to %s" \
		    "${DOMAIN}" "${MSG}" "${TARGET_HOST}"

		debug_message "domain_migrate - Running /usr/sbin/xm ${FAILOVER_TYPE} ${DOMAIN} ${PRIVATELINK_TARGET_HOST}"

		/usr/cluster/bin/hatimerun -t ${MAX_MIGRATE_TIMEOUT} -k KILL \
		    /usr/sbin/xm ${FAILOVER_TYPE} "${DOMAIN}" ${PRIVATELINK_TARGET_HOST} > /dev/null 2>&1
		rc=$?

		case "${rc}" in
		0)
			# SCMSGS
			# @explanation
			# The domain was migrated or live migrated to the target host.
			# @user_action
			# None required. The domain successfully migrated or live migrated
			# from the source node to the target node.
			scds_syslog -p daemon.notice -t $(syslog_tag) -m \
			    "Domain %s successfully %s to %s" \
			    "${DOMAIN}" "${MSG}" "${TARGET_HOST}"

			# As the domain has been successfully migrated or live migrated
			# we need to indicate a successful stop by performing a NO-OP stop
			# and subsequently a successful start by performing a NO-OP start.

			touch ${ADMIN}/.noop_${RESOURCE}
			debug_message "domain_migrate - ${ADMIN}/.noop_${RESOURCE} created"

			# SCMSGS
			# @explanation
			# The domain was migrated or live migrated.
			# @user_action
			# None required. Informational message.
			scds_syslog -p daemon.notice -t $(syslog_tag) -m \
			    "NO-OP STOP being performed"
			;;
		99)
			# SCMSGS
			# @explanation
			# The domain migration or live migration timed out.
			# @user_action
			# None required. Informational message.
			scds_syslog -p daemon.notice -t $(syslog_tag) -m \
			    "Migration of domain %s timed out, the domain state is now shut off" \
			    "${DOMAIN}"

			rc=1
			;;
		*)
			# SCMSGS
			# @explanation
			# The domain failed to migrate or live migrate to the target host.
			# @user_action
			# None required. The domain failed to migrate or live migrate
			# from the source node to the target node. A normal failover
			# will be performed.
			scds_syslog -p daemon.notice -t $(syslog_tag) -m \
			    "Domain %s failed to %s to %s, normal failover will be performed" \
			    "${DOMAIN}" "${MSG}" "${TARGET_HOST}"

			rc=1
			;;
		esac
	fi

	# If the domain has successfully migrated, we will now delete the domain.
	#
	# Doing this ensures that the domain is only defined and able to be started
	# on one cluster node at a time. Domains can use shared storage between cluster
	# nodes so it is very important that we prevent any data corruption if a domain
	# gets manually started on multiple cluster nodes where shared storage is used.
	#
	# Of course using SUNW.HAStoragePlus somewhat protects against this, however we
	# simply want to avoid any manual administrative errors performed by mistake.
	#
	# Note, unless the domain was migrated or live migrated, the domain is defined
	# before startup using a previously dumped XML file for the administrative file
	# system.

	if [ "${rc}" -eq 0 ]
	then
		domain_delete
	fi

	debug_message "Function: domain_migrate - End"
	return ${rc}
}
814
#
# domain_shutdown - shut the domain down, gracefully if possible.
#
# Reads DOMAIN and MAX_STOP_TIMEOUT and uses SECONDS as the clock. The
# domain is asked to shut down and is then polled every 5 seconds until
# it leaves the active states or MAX_STOP_TIMEOUT is reached. When the
# request fails, or the domain is still active afterwards,
# domain_destroy() is called. After a successful stop the domain is
# deleted on this node.
#
# Returns 0 when the domain was stopped, otherwise the status of
# domain_destroy().
#
domain_shutdown()
{
	debug_message "Function: domain_shutdown - Begin"
	${SET_DEBUG}

	typeset rc
	typeset active_states="running|blocked|paused|in shutdown"

	# Coordinate with the domain OS to perform a graceful shutdown.
	# Note that the virsh shutdown command returns before the domain
	# has shutdown, as such we do not use hatimerun.

	if ! /usr/bin/virsh shutdown ${DOMAIN} > /dev/null 2>&1
	then
		# SCMSGS
		# @explanation
		# The /usr/bin/virsh shutdown command failed.
		# @user_action
		# None required. The domain will now be terminated immediately.
		scds_syslog -p daemon.notice -t $(syslog_tag) -m \
		    "/usr/bin/virsh shutdown %s failed, immediate shutdown will now be performed" \
		    "${DOMAIN}"

		domain_destroy
		rc=$?
	else
		# Loop to test if the domain shuts down gracefully
		# or if the shutdown time is exceeded.

		while [ "${SECONDS}" -lt "${MAX_STOP_TIMEOUT}" ]
		do
			echo $(/usr/bin/virsh domstate ${DOMAIN}) | /usr/xpg4/bin/grep -q -E "${active_states}" && {
				sleep 5
				continue
			}

			# The domain is no longer active; advancing the clock
			# to the limit ends the loop.
			SECONDS=${MAX_STOP_TIMEOUT}
		done

		if ! echo $(/usr/bin/virsh domstate ${DOMAIN}) | /usr/xpg4/bin/grep -q -E "${active_states}"
		then
			# SCMSGS
			# @explanation
			# The domain was shutdown gracefully.
			# @user_action
			# None required. The domain has shutdown gracefully.
			scds_syslog -p daemon.info -t $(syslog_tag) -m \
			    "Domain %s has been gracefully shutdown" \
			    "${DOMAIN}"
			rc=0
		else
			# SCMSGS
			# @explanation
			# The domain failed to shutdown gracefully.
			# @user_action
			# None required. The domain failed to shutdown
			# gracefully and will now be immediately terminated.
			scds_syslog -p daemon.notice -t $(syslog_tag) -m \
			    "Domain %s failed to shutdown gracefully, immediate shutdown will now be performed" \
			    "${DOMAIN}"

			domain_destroy
			rc=$?
		fi
	fi

	# If the domain has successfully shutdown, we will now delete the domain.
	#
	# Doing this ensures that the domain is only defined and able to be started
	# on one cluster node at a time. Domains can use shared storage between cluster
	# nodes so it is very important that we prevent any data corruption if a domain
	# gets manually started on multiple cluster nodes where shared storage is used.
	#
	# Of course using SUNW.HAStoragePlus somewhat protects against this, however we
	# simply want to avoid any manual administrative errors performed by mistake.
	#
	# Note, unless the domain was migrated or live migrated, the domain is defined
	# before startup using a previously dumped XML file for the administrative file
	# system.

	if [ "${rc}" -eq 0 ]
	then
		domain_delete
	fi

	debug_message "Function: domain_shutdown - End"
	return ${rc}
}
901
#
# domain_destroy - terminate the domain immediately.
#
# Runs virsh destroy on DOMAIN and logs the outcome.
#
# Returns 0 when virsh destroy succeeded, 1 otherwise.
#
domain_destroy()
{
	debug_message "Function: domain_destroy - Begin"
	${SET_DEBUG}

	typeset rc

	if ! /usr/bin/virsh destroy ${DOMAIN} > /dev/null 2>&1
	then
		# SCMSGS
		# @explanation
		# The /usr/bin/virsh destroy command failed.
		# @user_action
		# Determine why it was not possible to immediately terminate
		# the domain.
		scds_syslog -p daemon.error -t $(syslog_tag) -m \
		    "Domain %s failed to shutdown immediately" \
		    "${DOMAIN}"
		rc=1
	else
		# SCMSGS
		# @explanation
		# The domain was immediately terminated.
		# @user_action
		# None required. The domain had previously failed to shutdown
		# gracefully but has now been immediately terminated.
		scds_syslog -p daemon.notice -t $(syslog_tag) -m \
		    "Domain %s has been immediately terminated" \
		    "${DOMAIN}"
		rc=0
	fi

	debug_message "Function: domain_destroy - End"
	return ${rc}
}
937
#
# domain_delete - remove the domain definition from this node.
#
# Runs xm delete on DOMAIN and logs the outcome.
#
# Returns 0 when the domain was deleted, 1 otherwise.
#
domain_delete()
{
	debug_message "Function: domain_delete - Begin"
	${SET_DEBUG}

	# The purpose of deleting the domain after shutdown is to avoid the possibility of
	# someone manually starting the domain on a different node. Doing so would compromise
	# the domain if shared storage was used for the domain. The domain's configuration
	# is always dumped to the agent's administrative file system so that the domain can
	# be defined before startup.

	typeset rc

	if /usr/sbin/xm delete ${DOMAIN} > /dev/null 2>&1
	then
		# SCMSGS
		# @explanation
		# The domain was deleted.
		# @user_action
		# None required. The domain has been deleted as it
		# will be defined on another node. Deleting the domain
		# on this node ensures that it can't be started on
		# more than one cluster node at a time.
		scds_syslog -p daemon.notice -t $(syslog_tag) -m \
		    "Domain %s has been deleted on this node" \
		    "${DOMAIN}"
		rc=0
	else
		# SCMSGS
		# @explanation
		# The /usr/sbin/xm delete command failed.
		# @user_action
		# Determine why it was not possible to delete the domain.
		scds_syslog -p daemon.error -t $(syslog_tag) -m \
		    "Failed to delete domain %s on this node" \
		    "${DOMAIN}"
		rc=1
	fi

	debug_message "Function: domain_delete - End"

	# Report the outcome to the caller, as domain_destroy() does.
	return ${rc}
}