1 #!/bin/ksh
2 #
3 # CDDL HEADER START
4 #
5 # The contents of this file are subject to the terms of the
6 # Common Development and Distribution License (the License).
7 # You may not use this file except in compliance with the License.
8 #
9 # You can obtain a copy of the license at usr/src/CDDL.txt
10 # or http://www.opensolaris.org/os/licensing.
11 # See the License for the specific language governing permissions
12 # and limitations under the License.
13 #
14 # When distributing Covered Code, include this CDDL HEADER in each
15 # file and include the License file at usr/src/CDDL.txt.
16 # If applicable, add the following below this CDDL HEADER, with the
17 # fields enclosed by brackets [] replaced with your own identifying
18 # information: Portions Copyright [yyyy] [name of copyright owner]
19 #
20 # CDDL HEADER END
21 #
22 # Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 # Use is subject to license terms.
24 #
25 # ident "%Z%%M% %I% %E% SMI"
26 #
27
#
# Agent-wide settings.  ${VM} and ${RESOURCE} are expanded here, so they
# must already be set by the time this file is read.
#
PKG="SUNWscxvm"
TASK_COMMAND=""
RESOURCE_PROJECT_NAME=""

# CCR table that holds the saved domain configuration (one per VM type)
CCR_TABLE="${VM}_domain_config"

# Scratch directory and the per-resource file that collects command output
TMP_DIR="/var/tmp"
LOGFILE="${TMP_DIR}/${RESOURCE}_logfile"

# Logging commands
SCLOGGER="/usr/cluster/lib/sc/scds_syslog"
LOGGER="/usr/bin/logger"

# Generic utilities
GREP="/usr/xpg4/bin/grep"
AWK="/usr/bin/awk"
PGREP="/usr/bin/pgrep"
SLEEP="/usr/bin/sleep"
TR="/usr/xpg4/bin/tr"

# Cluster commands
SCHA_RESOURCE_GET="/usr/cluster/bin/scha_resource_get"
SCHA_RESOURCEGROUP_GET="/usr/cluster/bin/scha_resourcegroup_get"
SCHA_CLUSTER_GET="/usr/cluster/bin/scha_cluster_get"
HATIMERUN="/usr/cluster/bin/hatimerun"
CCRADM="/usr/cluster/lib/sc/ccradm"
CL_EXEC_CLIENT="/usr/cluster/lib/sc/cl_exec_client"

# Virtualization management commands
LDM="/opt/SUNWldm/bin/ldm"
VIRSH="/usr/bin/virsh"
XM="/usr/sbin/xm"
52
#
# Print the tag that prefixes every syslog message of this agent:
#	SC[<package>.<method>]:<resource group>:<resource>
# Any component that is unset or empty is printed as "??".
#
syslog_tag()
{
	${SET_DEBUG}

	typeset origin="${PKG:-??}.${METHOD:-??}"
	typeset target="${RESOURCEGROUP:-??}:${RESOURCE:-??}"

	print "SC[${origin}]:${target}"
}
58
#
# Log a message to syslog.
#
# Usage: scds_syslog -p <priority> -t <tag> -m <format> [<argument> ...]
#
# When ${SCLOGGER} exists, all arguments are handed to it unchanged and it
# is run in the background.  Otherwise the options are parsed here, the
# message is built with printf from <format> and its arguments, and the
# result is passed to ${LOGGER}.
#
scds_syslog()
{
	typeset PRI=
	typeset TAG=
	typeset LOG_STRING
	typeset opt
	typeset saved_optind

	if [ -f "${SCLOGGER}" ]
	then
		${SCLOGGER} "$@" &
	else
		# getopts keeps its position in OPTIND between invocations.
		# Start from the first argument on every call, otherwise a
		# second call skips the options and re-uses the priority and
		# tag of the previous one.  The caller's value is restored
		# afterwards so an enclosing getopts loop is not disturbed.
		saved_optind=${OPTIND}
		OPTIND=1

		while getopts 'p:t:m' opt
		do
			case "${opt}" in
			t)	TAG=${OPTARG};;
			p)	PRI=${OPTARG};;
			esac
		done

		shift $((OPTIND - 1))
		OPTIND=${saved_optind}

		# The first remaining argument is the printf format string
		LOG_STRING=$(/usr/bin/printf "$@")

		# Quoted so the message is neither word-split nor globbed
		${LOGGER} -p "${PRI}" -t "${TAG}" "${LOG_STRING}"
	fi
}
78
#
# Print a (localized) message on standard error.
#
# Arguments: a printf format string followed by its arguments.
# Returns:   always 0.
#
i18n_message()
{
	debug_message "Function: i18n_message - Begin"
	# Every other function in this file runs ${SET_DEBUG} here; the
	# original ran ${DEBUG}, which debug_message() never sets.
	${SET_DEBUG}

	# Quoted so the formatted text is neither word-split nor globbed.
	# /usr/bin/printf is the same binary scds_syslog() uses.
	print -u2 "$(/usr/bin/printf "$@")"

	debug_message "Function: i18n_message - End"
	return 0
}
89
#
# Log a debug message according to ${DEBUG_LEVEL} and set ${SET_DEBUG},
# which the other functions expand as a command at their start.
#
#	DEBUG_LEVEL=0	no debug messages, no tracing
#	DEBUG_LEVEL=1	only messages containing "Begin" or "End"
#	DEBUG_LEVEL=2	all messages, SET_DEBUG="set -x"
#
# Arguments: $1 - the message text (further arguments are ignored)
# Returns:   non-zero when nothing was logged, otherwise the status of
#            scds_syslog.
#
debug_message()
{
	typeset DEBUG_TEXT=

	case ${DEBUG_LEVEL} in
	0)	# No debug msgs
		SET_DEBUG=
		;;
	1)	# Begin and End msgs
		SET_DEBUG=
		# Pattern match instead of an echo | grep pipeline: no
		# processes are spawned and the text is not globbed.
		case "${1}" in
		*Begin*|*End*)	DEBUG_TEXT=${1};;
		esac
		;;
	2)	# All debug msgs
		SET_DEBUG="set -x"
		DEBUG_TEXT=${1}
		;;
	esac

	[[ -n "${DEBUG_TEXT}" ]] && \
	    scds_syslog -p daemon.debug -t "$(syslog_tag)" -m \
	    "%s" "${DEBUG_TEXT}"
}
112
#
# Send the collected command output in ${LOGFILE} to syslog, one message
# per line, then empty ${LOGFILE}.
#
# Arguments: $1 - syslog level within the daemon facility (e.g. error)
#            $2 - header put in front of every logged line
#
log_message()
{
	debug_message "Function: log_message - Begin"

	${SET_DEBUG}

	typeset PRIORITY
	typeset HEADER
	typeset MSG_TXT

	if [ -s "${LOGFILE}" ]
	then
		PRIORITY=${1}
		HEADER=${2}

		#
		# Ensure that the while loop only reads a closed file
		#
		strings "${LOGFILE}" > "${LOGFILE}.copy"

		# -r keeps backslashes in the logged text intact
		while read -r MSG_TXT
		do
			scds_syslog -p daemon.${PRIORITY} -t "$(syslog_tag)" -m \
			    "%s - %s" "${HEADER}" "${MSG_TXT}"
		done < "${LOGFILE}.copy"
	fi

	# Truncate (or create) both files for the next caller
	: > "${LOGFILE}"
	: > "${LOGFILE}.copy"

	debug_message "Function: log_message - End"
}
144
145
#
# Print the value(s) of an extension property of a resource, one value
# per line.
#
# Arguments: $1 - resource name
#            $2 - extension property name
# Outputs:   every whitespace separated word of the scha_resource_get
#            output except the first one; an empty line if there is no
#            second word.
# Returns:   the exit status of scha_resource_get.
#
get_resource_property()
{
	debug_message "Function: get_resource_property - Begin"
	${SET_DEBUG}

	typeset RS=${1}
	typeset PROPERTY=${2}
	typeset OUTPUT
	typeset rc

	# Retrieve the property value.
	OUTPUT=$(${SCHA_RESOURCE_GET} -O Extension -R "${RS}" "${PROPERTY}")
	rc=${?}

	# debug_message only logs its first argument, so pass one string
	debug_message "get_resource_property - scha_resource_get of property ${PROPERTY} returned ${rc}"

	if (( rc == 0 ))
	then
		# Print the values.  awk walks the words of all lines and
		# skips the very first one; feeding it the quoted output
		# keeps values such as "*" from being globbed by the shell.
		/usr/bin/printf '%s\n' "${OUTPUT}" | ${AWK} '
		    {
			for (i = 1; i <= NF; i++)
				if (++seen > 1) {
					print $i
					found = 1
				}
		    }
		    END { if (!found) print "" }'
	fi

	debug_message "Function: get_resource_property - End"

	return ${rc}
}
173
174
#
# Read the named extension properties of ${RESOURCE} into the matching
# shell variables.  A variable that already has a value is left alone.
#
#	Domain_name	-> DOMAIN
#	Migration_type	-> MIGRATION_TYPE
#	Plugin_probe	-> PLUGIN_PROBE
#	Password_file	-> PASSWORD_FILE
#	Debug_level	-> DEBUG_LEVEL
#
# Arguments: the property names to retrieve
# Returns:   0 on success, otherwise the status of the first failing
#            get_resource_property call (remaining properties are skipped).
#
get_properties()
{
	debug_message "Function: get_properties - Begin"
	${SET_DEBUG}

	typeset -i rc=0
	typeset props=$*
	typeset prop
	typeset val

	for prop in ${props}
	do
		# Retrieve the property value.  The assignment is kept
		# separate from typeset: "typeset val=$(...)" reports the
		# status of typeset, which hides a failed lookup.
		val=$(get_resource_property ${RESOURCE} ${prop})
		rc=${?}

		if (( rc == 0 ))
		then
			case ${prop} in
			Domain_name)	[[ -z ${DOMAIN} ]] && DOMAIN=${val};;
			Migration_type)	[[ -z ${MIGRATION_TYPE} ]] && MIGRATION_TYPE=${val};;
			Plugin_probe)	[[ -z ${PLUGIN_PROBE} ]] && PLUGIN_PROBE=${val};;
			Password_file)	[[ -z ${PASSWORD_FILE} ]] && PASSWORD_FILE=${val};;
			Debug_level)	[[ -z ${DEBUG_LEVEL} ]] && DEBUG_LEVEL=${val};;
			esac
		else
			# SCMSGS
			# @explanation
			# The scha_resource_get call failed.
			# @user_action
			# Check the syslog for further messages.
			scds_syslog -p daemon.error -t $(syslog_tag) -m \
			    "Cannot get the property %s of resource %s." \
			    "${prop}" "${RESOURCE}"
			break
		fi
	done

	debug_message "Function: get_properties - End"

	return ${rc}
}
215
#
# Check that this node runs on the xVM platform, i.e. that
# "uname -i" reports i86xpv.
#
# Returns: 0 when it does, 1 (after logging and printing an error)
#          when it does not.
#
validate_xvm()
{
	debug_message "Function: validate_xvm - Begin"
	${SET_DEBUG}

	typeset verdict=1
	typeset msgtext
	typeset platform

	platform=$(/usr/bin/uname -i)

	case "${platform}" in
	i86xpv)
		verdict=0
		;;
	*)
		# SCMSGS
		# @explanation
		# Solaris is not booted with xVM.
		# @user_action
		# Ensure that the default boot grub menu is set to boot
		# Solaris xVM.
		scds_syslog -p daemon.error -t $(syslog_tag) -m \
		    "Node is not booted with xVM."

		msgtext=$(gettext "Node is not booted with xVM.")
		i18n_message "${msgtext}"
		;;
	esac

	debug_message "Function: validate_xvm - End"

	return ${verdict}
}
245
#
# Validate the LDom configuration of the resource.
#
# Checks, in this order:
#  - unless MIGRATION_TYPE is NORMAL, PASSWORD_FILE is set and names a
#    readable regular file
#  - "ldm ls" succeeds on this node
#  - the failure-policy of the primary domain is "reset"
#  - on every node of the resource group's node list that is not DOWN,
#    the state of ${DOMAIN} is acceptable, and the domain is configured
#    on at most one node
#  - if the domain is configured on no node at all, its configuration is
#    present in the CCR
#
# The domain configuration is dumped to the CCR when the domain is found
# on the local node.
#
# Returns: 0 if all checks pass, 1 otherwise (the reason is logged and
#          printed).
#
validate_ldom()
{
	debug_message "Function: validate_ldom - Begin"
	${SET_DEBUG}

	typeset ncount=0
	# Start with an empty node list so that nothing left over from an
	# earlier call or from the environment ends up in the error message
	typeset nlist=""
	typeset msgtext
	typeset policy nodename nodeid output result status domstate

	# Make sure that the password file is readable.
	if [[ ${MIGRATION_TYPE} != "NORMAL" ]]
	then
		if [ -z "${PASSWORD_FILE}" ]
		then
			# SCMSGS
			# @explanation
			# Password file cannot be null.
			# @user_action
			# Ensure that a password file name is specified.
			scds_syslog -p daemon.error -t $(syslog_tag) -m \
			    "Password file cannot be (null)."

			msgtext=$(gettext "Password file cannot be (null).")
			i18n_message "${msgtext}"

			debug_message "Function: validate_ldom - End"
			return 1
		fi

		if [[ ! -f "${PASSWORD_FILE}" ]] || [[ ! -r "${PASSWORD_FILE}" ]]
		then
			# SCMSGS
			# @explanation
			# Incorrect Password file specified.
			# @user_action
			# Ensure that a valid password file is specified.
			scds_syslog -p daemon.error -t $(syslog_tag) -m \
			    "Invalid password file specified %s." \
			    "${PASSWORD_FILE}"

			msgtext=$(gettext "Invalid password file specified %s.")
			i18n_message "${msgtext}" "${PASSWORD_FILE}"

			debug_message "Function: validate_ldom - End"
			return 1
		fi
	fi

	# Ensure that the LDom Manager can be queried on this node
	# ("ldm ls" has to succeed).
	if ! ${LDM} ls > /dev/null 2>&1
	then
		# SCMSGS
		# @explanation
		# Self explanatory.
		# @user_action
		# Ensure that the resource is configured in
		# control domain.
		scds_syslog -p daemon.error -t $(syslog_tag) -m \
		    "The LDom Manager is running in configuration mode."

		msgtext=$(gettext "The LDom Manager is running in configuration mode.")
		i18n_message "${msgtext}"

		debug_message "Function: validate_ldom - End"
		return 1
	fi

	# Ensure that the failure-policy setting is set to "reset".
	# If the control domain fails, this would allow the guest domains
	# to panic.
	policy=$(${LDM} list -o domain primary \
	    | ${AWK} -F"=" '$1~/failure-policy/ {print $2}')

	if [ "${policy}" != "reset" ]
	then
		# SCMSGS
		# @explanation
		# Incorrect failure-policy setting for the domain.
		# @user_action
		# Ensure that the failure-policy for the domain is
		# set to "reset" on the control domain.
		scds_syslog -p daemon.error -t $(syslog_tag) -m \
		    "Invalid failure policy \"%s\" for %s domain." \
		    "${policy}" "primary"

		msgtext=$(gettext "Invalid failure policy \"%s\" for %s domain.")
		i18n_message "${msgtext}" "${policy}" "primary"

		debug_message "Function: validate_ldom - End"
		return 1
	fi

	# The CL_EXEC_CLIENT program executes a command on any of the
	# cluster nodes or a zone or in a zone cluster. It then generates
	# as output the exit status of command and the stdout and stderr
	# messages. The valid options are:
	#	[ -z zoneclustername] The command is run on the zone cluster
	#		represented by the zonename.
	#	-C { TS | RT | FSS | FX } The scheduling class in which the
	#		command is to be run.
	#	-p pri Specifies the priority of the command in the given
	#		scheduling class.
	#	-n id[,id..] A comma separated list of node ID's of a
	#		zone cluster or a node to run the command.
	#	-c cmd [Args] The command to be run along with its arguments.

	for nodename in $(${SCHA_RESOURCEGROUP_GET} -O NODELIST -G ${RESOURCEGROUP})
	do
		if [[ "$(${SCHA_CLUSTER_GET} -O NodeState_Node ${nodename})" == "DOWN" ]]
		then
			continue
		fi

		nodeid=$(${SCHA_CLUSTER_GET} -O NODEID_NODENAME ${nodename})
		output=$(${CL_EXEC_CLIENT} -n ${nodeid} -c "${LDM} list-domain ${DOMAIN}")
		result=${?}

		# Word 6 of the cl_exec_client output is taken as the exit
		# status of the remote command.  An empty value counts as a
		# failure instead of breaking the arithmetic test below.
		status=$(echo ${output} | ${AWK} '{print $6}')

		if (( ${result} == 0 )) && (( ${status:-1} == 0 ))
		then
			# Word 18 of the output is taken as the domain state
			domstate=$(echo ${output} | ${AWK} -F" " '{print $18}')

			# NOTE(review): ${update} is not set anywhere in the
			# visible part of this file; presumably the caller sets
			# it to non-zero when validating an update.  An unset
			# value is treated as 0 so the state check is enforced
			# - confirm against the caller.
			if (( ${update:-0} == 0 )) && echo "${domstate}" | ${GREP} -q -E "^active$|suspending|resuming|suspended|starting" > /dev/null 2>&1
			then
				# SCMSGS
				# @explanation
				# The domain is in an invalid state.
				# @user_action
				# Ensure that the domain is in inactive or bound state.
				scds_syslog -p daemon.error -t $(syslog_tag) -m \
				    "Domain %s is in %s state on %s." \
				    "${DOMAIN}" "${domstate}" "${nodename}"

				msgtext=$(gettext "Domain %s is in %s state on %s.")
				i18n_message "${msgtext}" "${DOMAIN}" "${domstate}" "${nodename}"

				debug_message "Function: validate_ldom - End"
				return 1
			fi

			# Remember every node the domain is configured on
			ncount=$((ncount+1))
			nlist=$(echo ${nodename} ${nlist})

			# dump domain configuration to ccr
			if [[ "$(/usr/bin/hostname)" == "${nodename}" ]]
			then
				if ! dump_domain_config
				then
					debug_message "Function: validate_ldom - End"
					return 1
				fi
			fi
		fi
	done

	# The domain is configured nowhere: accept that only if its
	# configuration has been saved in the CCR.
	if (( ${ncount} == 0 ))
	then
		if ! ${CCRADM} showkey --key xml_${RESOURCE} ${CCR_TABLE} > /dev/null 2>&1
		then
			scds_syslog -p daemon.error -t $(syslog_tag) -m \
			    "Domain %s does not exist." \
			    "${DOMAIN}"

			msgtext=$(gettext "Domain %s does not exist.")
			i18n_message "${msgtext}" "${DOMAIN}"

			debug_message "Function: validate_ldom - End"
			return 1
		fi
	fi

	if [[ ${ncount} -gt 1 ]]
	then
		# SCMSGS
		# @explanation
		# The domain is configured on multiple
		# cluster nodes.
		# @user_action
		# Ensure that the domain is configured on one node
		# of the cluster.
		scds_syslog -p daemon.error -t $(syslog_tag) -m \
		    "Multiple domain %s configuration exists on %s." \
		    "${DOMAIN}" "${nlist}"

		msgtext=$(gettext "Multiple domain %s configuration exists on %s.")
		i18n_message "${msgtext}" "${DOMAIN}" "${nlist}"

		debug_message "Function: validate_ldom - End"
		return 1
	fi

	debug_message "Function: validate_ldom - End"
	return 0
}
437
#
# Validate the resource configuration.
#
# If PLUGIN_PROBE is set and its program exists as a regular file, the
# file must be readable.  The VM type specific checks are then delegated
# to validate_${VM}.
#
# Returns: 1 if the plugin probe file is not readable, otherwise the
#          status of validate_${VM}.
#
validate()
{
	debug_message "Function: validate - Begin"
	${SET_DEBUG}

	typeset rc
	typeset msgtext
	typeset probe_cmd

	# Make sure that the plugin probe specified is readable.
	if [[ -n "${PLUGIN_PROBE}" ]]
	then
		# PLUGIN_PROBE may carry arguments; as in check_domain() the
		# first word is the program to look at.
		probe_cmd=$(echo "${PLUGIN_PROBE}" | ${AWK} '{print $1}')

		if [ -f "${probe_cmd}" ] && [ ! -r "${probe_cmd}" ]
		then
			# SCMSGS
			# @explanation
			# Incorrect user probe file specified.
			# @user_action
			# Ensure that a valid user probe file is specified.
			scds_syslog -p daemon.error -t $(syslog_tag) -m \
			    "Invalid user probe file %s." \
			    "${PLUGIN_PROBE}"

			msgtext=$(gettext "Invalid user probe file %s.")
			i18n_message "${msgtext}" "${PLUGIN_PROBE}"

			debug_message "Function: validate - End"
			return 1
		fi
	fi

	validate_${VM}
	rc=${?}

	debug_message "Function: validate - End"
	return ${rc}
}
472
#
# get the domain status
#
# Prints whatever "virsh domstate ${DOMAIN}" prints and returns the exit
# status of that command.
#
get_xvm_status()
{
	debug_message "Function: get_xvm_status - Begin"
	${SET_DEBUG}

	typeset virsh_rc=0

	# Remember the status now; debug_message below would overwrite $?
	${VIRSH} domstate ${DOMAIN} || virsh_rc=${?}

	debug_message "Function: get_xvm_status - End"
	return ${virsh_rc}
}
489
#
# Print the state of ${DOMAIN} as reported by "ldm list-domain".
#
# The command output is collapsed into a single line and its tenth word
# is printed.  The raw output is left in the global OUTPUT.
#
# Returns: 1 if "ldm list-domain" fails, otherwise the exit status of
#          the awk command that extracts the state.
#
get_ldom_status()
{
	debug_message "Function: get_ldom_status - Begin"
	${SET_DEBUG}

	typeset exit_code=1

	if OUTPUT=$(${LDM} list-domain ${DOMAIN})
	then
		# Unquoted on purpose: all output lines are joined into one
		echo ${OUTPUT} | ${AWK} '{print $10}'
		exit_code=${?}
	fi

	debug_message "Function: get_ldom_status - End"
	return ${exit_code}
}
508
#
# Routines to create the domain on the current cluster node.
#
# add_xvm_domain defines the xVM domain from ${TMP_DIR}/${RESOURCE}.xml
# with "virsh define".  The command output is appended to ${LOGFILE}.
#
# Returns: 0 on success, 1 (after logging an error) on failure.
#
add_xvm_domain()
{
	debug_message "Function: add_xvm_domain - Begin"
	${SET_DEBUG}

	if ${VIRSH} define ${TMP_DIR}/${RESOURCE}.xml >> $LOGFILE 2>&1
	then
		debug_message "Function: add_xvm_domain - End"
		return 0
	fi

	# SCMSGS
	# @explanation
	# Defining the domain using an XML file failed.
	# @user_action
	# The command /usr/bin/virsh define failed to define the domain.
	# Determine if you have specified the correct domain name while
	# registering the resource.
	scds_syslog -p daemon.error -t $(syslog_tag) -m \
	    "Failed to define %s using %s/%s.xml." \
	    "${DOMAIN}" "${TMP_DIR}" "${RESOURCE}"

	debug_message "Function: add_xvm_domain - End"
	return 1
}
537
#
# Add the logical domain ${DOMAIN} on this node from
# ${TMP_DIR}/${RESOURCE}.xml with "ldm add-domain -i".  The command
# output is appended to ${LOGFILE}.
#
# Returns: 0 on success, 1 (after logging an error) on failure.
#
add_ldom_domain()
{
	debug_message "Function: add_ldom_domain - Begin"
	${SET_DEBUG}

	if ${LDM} add-domain -i ${TMP_DIR}/${RESOURCE}.xml ${DOMAIN} >> $LOGFILE 2>&1
	then
		debug_message "Function: add_ldom_domain - End"
		return 0
	fi

	# SCMSGS
	# @explanation
	# Defining the domain using an XML file failed.
	# @user_action
	# The command /opt/SUNWldm/bin/ldm "add-domain"
	# failed to define the domain. Determine if you
	# have specified the correct domain name when
	# registering the resource.
	scds_syslog -p daemon.error -t $(syslog_tag) -m \
	    "Failed to add the domain %s using %s/%s.xml." \
	    "${DOMAIN}" "${TMP_DIR}" "${RESOURCE}"

	debug_message "Function: add_ldom_domain - End"
	return 1
}
564
#
# test if domain is active
#
# Returns 0 when the output of "virsh domstate ${DOMAIN}" contains one of
# running, blocked, paused or in shutdown; otherwise the non-zero status
# of grep.
#
is_xvm_up()
{
	debug_message "Function: is_xvm_up - Begin"
	${SET_DEBUG}

	typeset state_text
	typeset match_rc

	state_text=$(${VIRSH} domstate ${DOMAIN})

	echo ${state_text} | \
	    ${GREP} -q -E "running|blocked|paused|in shutdown" > /dev/null 2>&1
	match_rc=${?}

	debug_message "Function: is_xvm_up - End"
	return ${match_rc}
}
582
#
# Test whether the logical domain is up.
#
# Returns 0 when get_ldom_status prints a line that is exactly "active"
# or "starting"; otherwise the non-zero status of grep.
#
is_ldom_up()
{
	debug_message "Function: is_ldom_up - Begin"
	${SET_DEBUG}

	typeset match_rc=0

	get_ldom_status | \
	    ${GREP} -q -E "^active$|^starting$" > /dev/null 2>&1 || match_rc=${?}

	debug_message "Function: is_ldom_up - End"
	return ${match_rc}
}
596
#
# wrapper routines to start xvm or ldom domains
#
# start_xvm runs "virsh start ${DOMAIN}", appends the command output to
# ${LOGFILE} and returns the exit status of virsh.
#
start_xvm()
{
	debug_message "Function: start_xvm - Begin"
	${SET_DEBUG}

	typeset virsh_rc=0

	${VIRSH} start ${DOMAIN} >> $LOGFILE 2>&1 || virsh_rc=${?}

	debug_message "Function: start_xvm - End"
	return ${virsh_rc}
}
613
#
# After a crash/reboot of the node, the domain
# would be started and there would be multiple
# instances of the same domain across cluster
# nodes. Hence the domain is brought down here.
#
# MAX_STOP_TIMEOUT is set to the resource's INIT_TIMEOUT before
# domain_shutdown is called.
#
# Returns: the exit status of domain_shutdown.
#
init_ldom()
{
	debug_message "Function: init_ldom - Begin"
	${SET_DEBUG}

	typeset shutdown_rc=0

	MAX_STOP_TIMEOUT=$(${SCHA_RESOURCE_GET} -O INIT_TIMEOUT \
	    -R ${RESOURCE} -G ${RESOURCEGROUP} )

	domain_shutdown || shutdown_rc=${?}

	debug_message "Function: init_ldom - End"
	return ${shutdown_rc}
}
636
#
# Start the logical domain ${DOMAIN}.
#
# An inactive domain is bound first.  The domain is then started with
# auto-boot? set to true, the function waits until the domain flags read
# "-n----", and auto-boot? is set back to false.  Command output is
# appended to ${LOGFILE}.
#
# Returns: 0 on success, 1 if binding, starting or either set-var failed.
#
start_ldom()
{
	debug_message "Function: start_ldom - Begin"
	${SET_DEBUG}

	typeset rc=0
	typeset flag

	if get_${VM}_status | ${GREP} -q -E "^inactive$" > /dev/null 2>&1
	then
		if ${LDM} bind-domain "${DOMAIN}" >> "${LOGFILE}" 2>&1
		then
			# SCMSGS
			# @explanation
			# The domain was bound.
			# @user_action
			# None required. The domain has been bound on this node.
			scds_syslog -p daemon.notice -t $(syslog_tag) -m \
			    "Domain %s is bound." \
			    "${DOMAIN}"
			rc=0
		else
			# SCMSGS
			# @explanation
			# The /opt/SUNWldm/bin/ldm bind-domain command failed.
			# @user_action
			# Determine why it was not possible to bind the domain.
			scds_syslog -p daemon.error -t $(syslog_tag) -m \
			    "Failed to bind %s." \
			    "${DOMAIN}"
			rc=1
		fi

	fi

	#
	# auto-boot? is true only while the domain is being started here.
	# It is switched back to false afterwards so the domain is made to
	# sit at the OBP prompt and a reboot/crash wouldn't boot the Guest
	# domain OS.
	#
	# The variable assignment is quoted: the "?" would otherwise be
	# treated as a file name pattern by the shell.
	#
	if (( rc == 0 )) && ${LDM} set-var 'auto-boot?=true' "${DOMAIN}" >> "${LOGFILE}" 2>&1
	then
		if ${LDM} start-domain "${DOMAIN}" >> "${LOGFILE}" 2>&1
		then
			# Wait until the flags field reads "-n----".
			# NOTE(review): there is no upper bound here; the loop
			# only ends when that exact flag string shows up -
			# presumably the method timeout is the safety net.
			while [ 1 ]
			do
				flag=$(${LDM} list-domain -p "${DOMAIN}" | ${GREP} "${DOMAIN}" \
				    | ${AWK} -F"|" '{print $4}'| ${AWK} -F"=" '{print $2}')
				[[ "${flag}" == "-n----" ]] && break
				${SLEEP} 1
			done
		else
			rc=1
		fi
		${LDM} set-var 'auto-boot?=false' "${DOMAIN}" >> "${LOGFILE}" 2>&1 || rc=1
	else
		rc=1
	fi

	debug_message "Function: start_ldom - End"
	return ${rc}
}
697
#
# Start the domain ${DOMAIN} on this node.
#
# Steps:
#  1. Start a background sleep of START_TIMEOUT seconds and tell PMF to
#     stop monitoring the resource's tag.
#  2. If the domain is not known on this node, retrieve its saved XML
#     configuration from the CCR into ${TMP_DIR}/${RESOURCE}.xml and add
#     the domain with add_${VM}_domain.
#  3. If the NO-OP flag for the resource is present in the CCR, delete it
#     and do not start anything; if the domain is already up, accept it
#     as manually started; otherwise start it with start_${VM}.
#  4. On success, save the current domain configuration to the CCR.
#
# Returns: 0 on success, 1 on any failure (the reason is logged).
#
start_domain()
{
	debug_message "Function: start_domain - Begin"
	${SET_DEBUG}

	typeset rc=0

	# Turn off PMF restart. Starting a domain does not leave
	# a running pid as in a classic Solaris Cluster agent.

	START_TIMEOUT=$(${SCHA_RESOURCE_GET} -O START_TIMEOUT \
	    -R ${RESOURCE} -G ${RESOURCEGROUP} )

	${SLEEP} ${START_TIMEOUT} &
	/usr/cluster/bin/pmfadm -s ${RESOURCEGROUP},${RESOURCE},0.svc

	# Check if the domain exists.
	#
	# If the domain does not exist, we may be starting the domain
	# on a new cluster node following a failover. As such we will
	# define the domain using the previously dumped XML
	# configuration, which is read back from the CCR.
	#
	# If the domain already exists, either the domain was manually
	# started or the domain was migrated or live migrated from
	# another cluster node. Therefore, we will use the already
	# defined domain.
	#
	# Note that when the domain is successfully stopped the domain
	# is deleted. We do this simply to avoid the domain from
	# being manually started on multiple cluster nodes. See
	# domain_delete() for more information.

	if get_${VM}_status > /dev/null 2>&1
	then
		debug_message "Validate - domain ${DOMAIN} exists"
	else
		if ${CCRADM} showkey --key xml_${RESOURCE} ${CCR_TABLE} > ${TMP_DIR}/${RESOURCE}.xml 2> /dev/null
		then
			# add the domain to the cluster node
			if add_${VM}_domain ${DOMAIN} ${TMP_DIR}/${RESOURCE}.xml
			then
				# SCMSGS
				# @explanation
				# The domain is being defined using a XML file.
				# @user_action
				# None, the domain is being defined using a previously defined
				# XML file when the domain was last successfully started.
				scds_syslog -p daemon.notice -t $(syslog_tag) -m \
				    "Domain %s defined using %s/%s.xml." \
				    "${DOMAIN}" "${TMP_DIR}" "${RESOURCE}"
			else
				# error already logged.
				debug_message "Function: start_domain - End"
				return 1
			fi
		else
			# SCMSGS
			# @explanation
			# The domain does not exist.
			# @user_action
			# You must ensure that the domain exists.
			scds_syslog -p daemon.error -t $(syslog_tag) -m \
			    "Domain %s does not exist." \
			    "${DOMAIN}"

			debug_message "Function: start_domain - End"
			return 1
		fi
	fi

	# Tolerate a manually started domain and a NO-OP start
	# otherwise start the domain.

	if ${CCRADM} showkey --key noop_${RESOURCE} ${CCR_TABLE} > /dev/null 2>&1
	then
		# SCMSGS
		# @explanation
		# The domain was migrated or live migrated.
		# @user_action
		# None required. Informational message.
		scds_syslog -p daemon.notice -t $(syslog_tag) -m \
		    "NO-OP START being performed."

		# The flag is consumed here so that the next start is a
		# real start again.
		if ! ${CCRADM} delkey --key noop_${RESOURCE} ${CCR_TABLE} >> $LOGFILE 2>&1
		then
			# SCMSGS
			# @explanation
			# Failed to delete the NO-OP flag from CCR.
			# @user_action
			# Check the syslog for further messages.
			# Determine why the NO-OP flag could not be deleted from the CCR.
			scds_syslog -p daemon.error -t $(syslog_tag) -m \
			    "Failed to delete NO-OP flag for %s domain." \
			    "${DOMAIN}"

			debug_message "Function: start_domain - End"
			return 1
		else
			debug_message "start_domain - noop_${RESOURCE} deleted"
		fi

	elif is_${VM}_up
	then
		# SCMSGS
		# @explanation
		# The domain was manually started.
		# @user_action
		# None required. Informational message.
		scds_syslog -p daemon.notice -t $(syslog_tag) -m \
		    "Domain %s was manually started." \
		    "${DOMAIN}"
	else
		if start_${VM}
		then
			# SCMSGS
			# @explanation
			# The domain was started successfully.
			# @user_action
			# None required. Informational message.
			scds_syslog -p daemon.notice -t $(syslog_tag) -m \
			    "Domain %s started." \
			    "${DOMAIN}"
		else
			# SCMSGS
			# @explanation
			# The domain failed to start.
			# @user_action
			# Check the syslog for further messages. If possible
			# the cluster will attempt to restart the domain.
			scds_syslog -p daemon.error -t $(syslog_tag) -m \
			    "Domain %s failed to start." \
			    "${DOMAIN}"

			rc=1
		fi
	fi

	if (( ${rc} == 0 ))
	then
		# Save the domain configuration in the CCR. It is then
		# used on another cluster node to define the domain but only if the
		# domain does not exist.

		dump_domain_config
		rc=${?}
	fi

	debug_message "Function: start_domain - End"
	return ${rc}
}
849
#
# dump the domain configuration
#
# dump_xvm_xml writes the XML description of ${DOMAIN}, as produced by
# "virsh dumpxml", to standard output.  Error output of virsh is appended
# to ${LOGFILE}.
#
# Returns: 0 on success, 1 (after logging an error) on failure.
#
dump_xvm_xml()
{
	debug_message "Function: dump_xvm_xml - Begin"
	${SET_DEBUG}

	typeset rc=0

	if ! ${VIRSH} dumpxml ${DOMAIN} 2>> $LOGFILE
	then
		# SCMSGS
		# @explanation
		# "/usr/bin/virsh dumpxml" for domain failed.
		# @user_action
		# Determine why the command to dump domain
		# configuration failed.
		scds_syslog -p daemon.error -t $(syslog_tag) -m \
		    "%s dumpxml for domain %s failed." \
		    "${VIRSH}" "${DOMAIN}"

		# Fixed value as in dump_ldom_xml: $? at this point is the
		# status of scds_syslog, not of the failed virsh command.
		rc=1
	fi

	debug_message "Function: dump_xvm_xml - End"
	return ${rc}
}
877
#
# Write the constraints of ${DOMAIN} in XML form, as produced by
# "ldm list-constraints -x", to standard output.  Error output of ldm is
# appended to ${LOGFILE}.
#
# Returns: 0 on success, 1 (after logging an error) on failure.
#
dump_ldom_xml()
{
	debug_message "Function: dump_ldom_xml - Begin"
	${SET_DEBUG}

	typeset exit_code=0

	if ${LDM} list-constraints -x ${DOMAIN} 2>> $LOGFILE
	then
		:
	else
		# SCMSGS
		# @explanation
		# "/opt/SUNWldm/bin/ldm list-constraints -x"
		# for domain failed.
		# @user_action
		# Determine why the command to list the
		# domain constraints failed.
		scds_syslog -p daemon.error -t $(syslog_tag) -m \
		    "%s list-constraints for domain %s failed." \
		    "${LDM}" "${DOMAIN}"
		exit_code=1
	fi

	debug_message "Function: dump_ldom_xml - End"
	return ${exit_code}
}
903
#
# save the domain configuration in the cluster
# configuration repository
#
# The XML produced by dump_${VM}_xml is flattened into one line and
# stored under the key xml_${RESOURCE} in the CCR table ${CCR_TABLE}.
# The table is created when reading the key fails with status 1.  The
# CCR is only written when the configuration differs from the stored one.
#
# Returns: 0 on success, 1 if the table could not be created, the dump
#          failed or was empty, or the CCR could not be updated.
#
dump_domain_config()
{
	debug_message "Function: dump_domain_config - Begin"
	${SET_DEBUG}

	typeset rc=0
	typeset olddesc
	typeset newdesc
	typeset output

	# Dump the domain configuration into an XML file. The domain configuration
	# can be changed, when under the agent control.

	olddesc=$(${CCRADM} showkey --key xml_${RESOURCE} ${CCR_TABLE} 2> /dev/null)

	if (( ${?} == 1 ))
	then
		#
		# The ccr table might not exist.
		# create the CCR table, if it doesn't exist.
		#
		if ${CCRADM} addtab ${CCR_TABLE} >> $LOGFILE 2>&1
		then
			debug_message "created ccr table ${CCR_TABLE}"
		else
			# SCMSGS
			# @explanation
			# Failed to create the CCR table.
			# @user_action
			# Check the syslog for further messages.
			# Determine why the CCR create failed.
			scds_syslog -p daemon.error -t $(syslog_tag) -m \
			    "Failed to create CCR table %s." \
			    "${CCR_TABLE}"

			debug_message "Function: dump_domain_config - End"
			return 1
		fi
	fi

	output=$(dump_${VM}_xml)
	if (( ${?} == 0 )) && [[ -n "${output}" ]]
	then
		# NOTE(review): the unquoted echo already joins the lines and
		# leaves the XML open to file name expansion, and the second
		# tr operand '[ ]' looks like it was meant to be a single
		# blank.  Left unchanged because it determines the value that
		# is stored in, and compared with, the CCR - confirm.
		newdesc=$(echo ${output} | ${TR} -s '\n' '[ ]')
		if [ "${olddesc}" != "${newdesc}" ]
		then
			# Try to add the key first; if that fails (presumably
			# because the key already exists) change it instead.
			if ! ${CCRADM} addkey --key=xml_${RESOURCE} --value "${newdesc}" ${CCR_TABLE} > /dev/null 2>&1
			then
				if ! ${CCRADM} changekey --key=xml_${RESOURCE} --value "${newdesc}" ${CCR_TABLE} >> $LOGFILE 2>&1
				then
					# SCMSGS
					# @explanation
					# Failed to update the XML dump to the CCR.
					# @user_action
					# Check the syslog for further messages.
					# Determine why the ccr update failed.
					scds_syslog -p daemon.error -t $(syslog_tag) -m \
					    "Failed to update domain XML %s to ccr." \
					    "${DOMAIN}"

					rc=1
				fi
			else
				# debug_message logs its first argument only
				# and does no printf formatting
				debug_message "dump_domain_config - ${DOMAIN} configuration added to CCR"
			fi
		fi
	else
		# error already logged.
		rc=1
	fi

	debug_message "Function: dump_domain_config - End"

	return ${rc}
}
979
#
# probe function for domain data service
#
# Returns:
#	0	the domain is in an acceptable run state and either no
#		plugin probe is configured, the plugin probe returned 0,
#		or the plugin probe is not an executable file (an error
#		is logged in that case)
#	100	the start program is still running, the domain is not in
#		an acceptable run state, or the plugin probe timed out,
#		asked for a restart or returned an unexpected value
#	201	the plugin probe asked for an immediate failover while
#		neither gds_svc_start nor gds_svc_stop is running for
#		the resource
#
check_domain()
{
	debug_message "Function: check_domain - Begin"
	${SET_DEBUG}

	typeset rc
	SECONDS=0

	if ${PGREP} -f "control_xvm start -R ${RESOURCE} " >/dev/null 2>&1
	then
		debug_message "Function: check_domain - start program is still running "
		rc=100
	else
		domstate=$(get_${VM}_status 2>/dev/null)

		case "${domstate}" in

		# Acceptable run states
		"running"|"blocked"|"paused"|"in shutdown"| \
		"active"|"suspending"|"resuming"|"suspended"|"starting")

			if [ "${#PLUGIN_PROBE}" -ne 0 ]
			then
				# PLUGIN_PROBE may carry arguments; its first
				# word has to be an executable file
				if [ -x "$(echo ${PLUGIN_PROBE} | ${AWK} '{print $1}')" ]
				then
					PROBE_TIMEOUT=$(${SCHA_RESOURCE_GET} -O Extension -R ${RESOURCE} -G ${RESOURCEGROUP} Probe_timeout|tail -1)
					# Run the supplied probe with only 90% of PROBE_TIMEOUT. Also note that this
					# is supplied as a parameter to the PLUGIN_PROBE.

					HATIMERUN_TIMEOUT=$((PROBE_TIMEOUT*90/100-${SECONDS}))

					output=$(${HATIMERUN} -t ${HATIMERUN_TIMEOUT} -k 9 ${PLUGIN_PROBE} ${HATIMERUN_TIMEOUT})
					rc=${?}

					case ${rc} in
					0)	debug_message "check_domain - ${DOMAIN} ${output}"
						rc=0
						;;
					99)
						# SCMSGS
						# @explanation
						# The domain probe timed out.
						# @user_action
						# Ensure that ${PLUGIN_PROBE} can complete within
						# 90% of PROBE_TIMEOUT.
						scds_syslog -p daemon.error -t $(syslog_tag) -m \
						    "%s did not complete within %s seconds." \
						    "${PLUGIN_PROBE}" "${HATIMERUN_TIMEOUT}"

						rc=100
						;;
					100)	if ${PGREP} -f "gds_svc_start .*-R ${RESOURCE} " >/dev/null 2>&1
						then
							debug_message "check_domain - ${DOMAIN} is still starting"
							rc=100
						elif ${PGREP} -f "gds_svc_stop .*-R ${RESOURCE} " >/dev/null 2>&1
						then
							debug_message "check_domain - ${DOMAIN} is stopping"
							rc=100
						else
							# SCMSGS
							# @explanation
							# The domain probe has requested a domain restart.
							# @user_action
							# None. A domain restart will be attempted.
							#
							# The format used to start with "% has",
							# which is not a valid conversion and
							# left the probe name out of the message.
							scds_syslog -p daemon.error -t $(syslog_tag) -m \
							    "%s has requested a domain restart %s." \
							    "${PLUGIN_PROBE}" "${output}"

							rc=100
						fi
						;;
					201)	if ${PGREP} -f "gds_svc_start .*-R ${RESOURCE} " >/dev/null 2>&1
						then
							debug_message "check_domain - ${DOMAIN} is still starting"
							rc=100
						elif ${PGREP} -f "gds_svc_stop .*-R ${RESOURCE} " >/dev/null 2>&1
						then
							debug_message "check_domain - ${DOMAIN} is stopping"
							rc=100
						else
							# SCMSGS
							# @explanation
							# The domain has requested an immediate failover.
							# @user_action
							# None. The domain will be immediately failed over.
							scds_syslog -p daemon.error -t $(syslog_tag) -m \
							    "%s has requested an immediate failover." \
							    "${PLUGIN_PROBE}"

							rc=201
						fi
						;;
					*)
						# SCMSGS
						# @explanation
						# ${PLUGIN_PROBE} did not return 0, 100 or 201.
						# @user_action
						# None. A domain restart will be attempted.
						scds_syslog -p daemon.error -t $(syslog_tag) -m \
						    "%s did not return 0, 100 or 201, a domain restart will be attempted." \
						    "${PLUGIN_PROBE}"
						rc=100
						;;
					esac
				else
					# SCMSGS
					# @explanation
					# ${PLUGIN_PROBE} does not exist or is not executable.
					# @user_action
					# Check the pathname exists and that ${PLUGIN_PROBE} is executable.
					scds_syslog -p daemon.error -t $(syslog_tag) -m \
					    "%s non-existent executable." \
					    "${PLUGIN_PROBE}"

					# The domain itself is in a run state, so
					# the probe still reports success
					rc=0
				fi
			else
				rc=0
			fi

			;;

		# Restartable run states

		"shut off"|"crashed"| \
		"inactive"|"stopping")

			rc=100
			;;

		# Unknown run states

		*)
			rc=100
			;;
		esac

		debug_message "check_domain - ${DOMAIN} ${domstate}"

	fi

	debug_message "Function: check_domain - End"
	return ${rc}
}
1128
#
# Stop the domain on this node, either by shutting it down or by
# migrating it to another node.
#
# The current domain configuration is saved to the CCR first.  If the
# resource is being disabled (ON_OFF_SWITCH is DISABLED) the domain is
# always shut down.  Otherwise MIGRATION_TYPE decides: NORMAL shuts the
# domain down, MIGRATE* tries domain_migrate and falls back to
# domain_shutdown if the migration fails.
#
# Sets the globals STOP_TIMEOUT, MAX_MIGRATE_TIMEOUT and MAX_STOP_TIMEOUT.
#
# Returns: 0 on success; 1 if the configuration could not be saved or
#          MIGRATION_TYPE is invalid; otherwise the status of
#          domain_shutdown.
#
stop_domain()
{
	debug_message "Function: stop_domain - Begin"
	${SET_DEBUG}

	typeset rc=0

	STOP_TIMEOUT=$(${SCHA_RESOURCE_GET} -O STOP_TIMEOUT \
	    -R ${RESOURCE} -G ${RESOURCEGROUP} )

	# Note that GDS will attempt to cleanup after 80% of STOP_TIMEOUT
	# has been consumed. In this regard, we only allocate a combined
	# 75% of STOP_TIMEOUT to MAX_MIGRATE_TIMEOUT and MAX_STOP_TIMEOUT.
	#
	# This leaves 5% for domain_destroy() which may be called if
	# domain_shutdown() exceeds its timeout and finally domain_delete().

	MAX_MIGRATE_TIMEOUT=$((STOP_TIMEOUT*25/100))
	MAX_STOP_TIMEOUT=$((STOP_TIMEOUT*50/100))
	SECONDS=0

	# Save the domain configuration changes.
	if ! dump_domain_config
	then
		debug_message "Function: stop_domain - End"
		return 1
	fi

	# At resource creation, the administrator can determine the Migration_type.
	# Valid values for Migration_type are
	#
	# Migration_type="normal"
	#  o Stop the resource (shutdown the domain)
	#  o Failover the resource group from the source node to the target node
	#  o Start the resource (start the domain)
	#
	# Migration_type="migrate"
	#  o Suspend the domain on the source node
	#  o Copy the domain's memory pages from the source node to the target node
	#  o Resume the domain on the target node
	#
	# Migration_type="migrate_live"
	#  o Iteratively copy the domain's memory pages from the source node to the target node
	#  o When pre-copy is no longer beneficial, suspend the domain on the source node
	#  o Copy the domain's remaining "dirty" pages from the source node to the target node
	#  o Resume the domain on the target node
	#
	# (The code below compares ${MIGRATION_TYPE} against the upper-case
	# spellings NORMAL and MIGRATE*.)
	#
	# Note that migration or live migration is performed over the cluster interconnect.
	#
	# For migration or live migration to be attempted across Solaris Cluster xVM nodes
	# the following conditions must be met.
	#
	# - The target Solaris Cluster xVM node must be running the same xVM version.
	#
	# - The migration TCP port must be open and accepting connections from the source
	#   Solaris Cluster xVM node.
	#
	# - There must be sufficient resources for the domain to run in.
	#
	# - If the conditions are met and migration or live migration is successful a NO-OP
	#   STOP and START is performed. This will ensure a successful STOP and START to the
	#   appropriate RGM callback methods. Furthermore, doing a NO-OP RGM failover will
	#   ensure that RGM subsequently actions any dependencies and that Solaris Cluster
	#   reflects the correct state and status of resource groups and resources.
	#
	# - If the conditions are met but migration or live migration is not successful a
	#   normal failover will be performed.
	#
	# - If the conditions are not met, migration or live migration will fail and a normal
	#   failover will be performed.
	#
	# However, before attempting a migration or live migration we need to determine if the
	# resource is being disabled. To distinguish if the resource is being disabled we
	# test the ON_OFF_SWITCH property of the resource.
	#
	# If the resource is being disabled the ON_OFF_SWITCH will be DISABLED before the STOP
	# method is called. So, conversely if the ON_OFF_SWITCH is ENABLED the resource is not
	# being disabled and instead the resource group is undergoing either a switch to
	# another node or is being evacuated from the node.
	#
	# - If the resource is being disabled we perform a normal shutdown, regardless of the
	#   Migration_type setting.

	ON_OFF_SWITCH=$(${SCHA_RESOURCE_GET} -O ON_OFF_SWITCH -R ${RESOURCE} -G ${RESOURCEGROUP})

	debug_message "stop_domain - ON_OFF_SWITCH=${ON_OFF_SWITCH}"
	debug_message "stop_domain - MIGRATION_TYPE=${MIGRATION_TYPE}"

	if [[ "${ON_OFF_SWITCH}" = "DISABLED" ]]
	then
		# Report a failed shutdown here as well; without capturing
		# the status a failed shutdown was returned as success.
		domain_shutdown
		rc=${?}
	else
		case "${MIGRATION_TYPE}" in
		NORMAL)	domain_shutdown
			rc=${?}
			;;
		MIGRATE*)
			# Fall back to a normal shutdown if the migration fails
			if domain_migrate
			then
				rc=0
			else
				domain_shutdown
				rc=${?}
			fi
			;;
		*)
			# SCMSGS
			# @explanation
			# Invalid Migration_type specified.
			# @user_action
			# Delete and reregister the resource with
			# a valid Migration_type entry.
			scds_syslog -p daemon.error -t $(syslog_tag) -m \
			    "Invalid Migration_type=%s." \
			    "${MIGRATION_TYPE}"
			rc=1
			;;
		esac
	fi

	debug_message "Function: stop_domain - End"
	return ${rc}
}
1249
#
# get_target_host - work out where the domain should be migrated to.
#
# Calls check_commandlog(), which sets TARGET_HOST from the cluster command
# log, and then validates that value.
#
# Globals read:    RESOURCEGROUP, SCHA_RESOURCEGROUP_GET, SCHA_CLUSTER_GET
# Globals written: TARGET_HOST (via check_commandlog),
#                  PRIVATELINK_TARGET_HOST (only when a valid target is found)
# Returns: 0 when a valid migration target was determined, 1 otherwise
#          (the caller then falls back to a normal failover).
#
get_target_host()
{
	debug_message "Function: get_target_host - Begin"
	${SET_DEBUG}

	typeset rc=1
	typeset local_node node

	local_node=$(/usr/bin/uname -n)

	# Here, we need to determine the target host as the resource group is either
	# being switched or the node, where the resource group is online, is being
	# evacuated.
	#
	# To determine the target host for a resource group switch we rely on the
	# cluster command log file /var/cluster/logs/commandlog to supply the target
	# host. We need to obtain the correct entry from the command log file and
	# match against the following
	#
	#	<date> + ${RESOURCEGROUP} + "START" + "switch"
	#
	# after which we only save the nodename from a clrg or scswitch command.
	#
	# Sample /var/cluster/logs/commandlog output is as follows,
	#
	# 02/07/2008 08:45:13 pelko1 10548 root START - scswitch -z -g "xvm2-rg" -h "pelko2"
	# 02/07/2008 08:45:38 pelko1 10548 root END 0
	# 02/07/2008 09:01:35 pelko1 10874 root START - clrg "switch" -n "pelko1" "xvm2-rg"
	# 02/07/2008 09:01:36 pelko1 10874 root END -20827641
	#
	# The last 10 seconds of the command log are checked, see check_commandlog().
	#
	# Once we have matched an entry, we verify that the target host is a valid
	# nodelist entry for the resource group.
	#
	# - If we have a valid nodelist entry we then determine that target host's
	#   cluster interconnect hostname to perform the migration or live migration.
	#
	# - If no "switch" entry is found, or the entry shows that this node is being
	#   evacuated, we leave the choice of node to RGM and a normal failover is
	#   performed. This ensures that we do not migrate or live migrate the domain
	#   to a node that may be different to the node selected by RGM.

	check_commandlog

	debug_message "get_target_host - ${TARGET_HOST} size=${#TARGET_HOST}"

	if [[ -z "${TARGET_HOST}" ]]
	then
		# SCMSGS
		# @explanation
		# A target host was not found
		# @user_action
		# None required. The domain will not be migrated or live
		# migrated instead a normal failover will be performed.
		scds_syslog -p daemon.notice -t $(syslog_tag) -m \
		    "Target host not found, normal failover will be performed."

	elif [[ "${TARGET_HOST}" = "${local_node}" || "${TARGET_HOST}" = *[0-9]:global* ]]
	then
		# The command named this node (or a <nodeid>:global entry), which
		# is what an evacuation of this node looks like in the command log.

		# SCMSGS
		# @explanation
		# The node is being evacuated.
		# @user_action
		# None required. The domain will not be migrated or live
		# migrated. Instead, a normal failover will be performed.
		scds_syslog -p daemon.notice -t $(syslog_tag) -m \
		    "Node is being evacuated, normal failover will be performed."

	else
		# The target is only valid when it is a nodelist entry other than
		# this node. Both conditions must hold; accepting either one would
		# validate any target as soon as the nodelist contains another node.
		for node in $(${SCHA_RESOURCEGROUP_GET} -O NODELIST -G ${RESOURCEGROUP})
		do
			if [[ "${node}" != "${local_node}" && "${node}" = "${TARGET_HOST}" ]]
			then
				rc=0
				break
			fi
		done

		if (( rc == 0 ))
		then
			PRIVATELINK_TARGET_HOST=$(${SCHA_CLUSTER_GET} -O PRIVATELINK_HOSTNAME_NODE "${TARGET_HOST}")
			debug_message "get_target_host - PRIVATELINK_TARGET_HOST=${PRIVATELINK_TARGET_HOST}"

			# Without an interconnect hostname there is nothing to
			# migrate to, so let the caller do a normal failover.
			if [[ -z "${PRIVATELINK_TARGET_HOST}" ]]
			then
				debug_message "get_target_host - no private interconnect hostname found for ${TARGET_HOST}"
				rc=1
			fi
		else
			# SCMSGS
			# @explanation
			# The target host found in the command log file is not
			# a valid entry within the resource groups nodelist.
			# @user_action
			# None required. The domain will not be migrated or live
			# migrated instead a normal failover will be performed.
			scds_syslog -p daemon.notice -t $(syslog_tag) -m \
			    "Target host %s not matched with the resource group nodelist, normal failover will be performed." \
			    "${TARGET_HOST}"
		fi
	fi

	debug_message "Function: get_target_host - End"
	return ${rc}
}
1352
#
# check_commandlog - search the cluster command log for the command that
# triggered this stop and extract the node name it refers to.
#
# Arguments: $1 - optional path of the command log to search
#                 (default /var/cluster/logs/commandlog)
# Globals read:    RESOURCEGROUP, AWK, TR
# Globals written: TARGET_HOST - node name taken from the newest matching
#                  START entry of the last 10 seconds, or empty if none
#
check_commandlog()
{
	debug_message "Function: check_commandlog - Begin"

	typeset commandlog=${1:-/var/cluster/logs/commandlog}
	typeset etime stamp
	typeset -i tries=10

	TARGET_HOST=""

	if [[ -r "${commandlog}" ]]
	then
		# Get the current epoch time
		etime=$(/usr/bin/perl -e 'print time;')

		while (( tries > 0 ))
		do
			# Iteratively search the commandlog for a switch or evacuate,
			# going back in time by one second each time. If a match is
			# found we break out of the loop.
			#
			# Both the date and the time are derived from the epoch value
			# of the second being checked, so that a look-back which
			# crosses midnight still uses the date the entry was logged on.
			stamp=$(/usr/bin/perl -MPOSIX -e \
			    'print strftime("%m/%d/%Y %H:%M:%S", localtime($ARGV[0]));' "${etime}")

			# An empty pattern would match every line of the log.
			if [[ -z "${stamp}" ]]
			then
				debug_message "check_commandlog - unable to format time ${etime}"
				TARGET_HOST=""
				break
			fi

			debug_message "check_commandlog - performed for ${stamp}"

			# Check for a clrg switch or scswitch
			TARGET_HOST=$(/usr/bin/grep "${stamp}" "${commandlog}" |\
			    /usr/bin/grep -w START | /usr/bin/grep switch | /usr/bin/grep \"${RESOURCEGROUP}\" |\
			    /usr/bin/sed -e 's/^.*-h //' -e 's/^.*-n //' | ${AWK} '{print $1}' | ${TR} -d '" ')

			[[ -n "${TARGET_HOST}" ]] && break

			# Check for a clrg evacuate
			TARGET_HOST=$(/usr/bin/grep "${stamp}" "${commandlog}" |\
			    /usr/bin/grep -w START | /usr/bin/grep evacuate |\
			    /usr/bin/sed -e 's/^.*-n //' | ${AWK} '{print $1}' | ${TR} -d '+" ' )

			[[ -n "${TARGET_HOST}" ]] && break

			# Check for a scswitch -S
			TARGET_HOST=$(/usr/bin/grep "${stamp}" "${commandlog}" |\
			    /usr/bin/grep -w START | /usr/bin/grep scswitch | /usr/bin/grep "\-S" |\
			    /usr/bin/sed -e 's/^.*-h //' | ${AWK} '{print $1}' | ${TR} -d '\-SK" ' )

			[[ -n "${TARGET_HOST}" ]] && break

			tries=$((tries - 1))
			etime=$((etime - 1))
		done
	else
		debug_message "check_commandlog - ${commandlog} is not readable"
	fi

	debug_message "check_commandlog - TARGET_HOST=${TARGET_HOST}"

	debug_message "Function: check_commandlog - End"
}
1417
1418 #
1419 # routines to perform domain migration
1420 #
#
# migrate_xvm - migrate or live migrate an xVM domain with "xm migrate".
#
# Globals read:    MIGRATION_TYPE (MIGRATE or MIGRATE_LIVE), DOMAIN,
#                  PRIVATELINK_TARGET_HOST, MAX_MIGRATE_TIMEOUT, HATIMERUN, XM
# Globals written: OPTION - the xm sub-command (and flag) that is run
# Returns: the exit status of hatimerun, or 1 when MIGRATION_TYPE does not
#          map to an xm sub-command.
#
migrate_xvm()
{
	debug_message "Function: migrate_xvm - Begin"
	${SET_DEBUG}

	typeset rc=0

	case "${MIGRATION_TYPE}" in
	MIGRATE)
		OPTION="migrate"
		;;
	MIGRATE_LIVE)
		OPTION="migrate --live"
		;;
	*)
		debug_message "migrate_xvm - unsupported MIGRATION_TYPE=${MIGRATION_TYPE}"
		debug_message "Function: migrate_xvm - End"
		return 1
		;;
	esac

	debug_message "domain_migrate - Running /usr/sbin/xm ${OPTION} ${DOMAIN} ${PRIVATELINK_TARGET_HOST}"

	# Run the xm sub-command held in OPTION, not the Migration_type property
	# value: "MIGRATE" / "MIGRATE_LIVE" are not xm sub-commands. OPTION is
	# left unquoted on purpose so that "migrate --live" becomes two words.
	${HATIMERUN} -t ${MAX_MIGRATE_TIMEOUT} -k KILL \
	    ${XM} ${OPTION} "${DOMAIN}" ${PRIVATELINK_TARGET_HOST} > /dev/null 2>&1
	rc=${?}

	debug_message "Function: migrate_xvm - End"
	return ${rc}
}
1440
#
# migrate_ldom - migrate a logical domain through the agent's ldm_migrate
# helper, bounded by hatimerun. Output is appended to LOGFILE.
#
# Returns: the exit status of hatimerun.
#
migrate_ldom()
{
	debug_message "Function: migrate_ldom - Begin"
	${SET_DEBUG}

	typeset exit_code=0

	# OPTION is only assigned here for a plain migration.
	if [[ "${MIGRATION_TYPE}" = "MIGRATE" ]]
	then
		OPTION="migrate"
	fi

	debug_message "domain_migrate - Running /opt/SUNWscxvm/bin/ldm_migrate ${OPTION} ${DOMAIN} ${PRIVATELINK_TARGET_HOST}"

	if ${HATIMERUN} -t ${MAX_MIGRATE_TIMEOUT} -k KILL \
	    /opt/SUNWscxvm/bin/ldm_migrate ${OPTION} "${DOMAIN}" ${PRIVATELINK_TARGET_HOST} ${PASSWORD_FILE} >> $LOGFILE 2>&1
	then
		exit_code=0
	else
		exit_code=${?}
	fi

	debug_message "Function: migrate_ldom - End"
	return ${exit_code}
}
1459
1460 #
1461 # routines to cancel migration
1462 #
cancel_xvm_migration()
{
	# There is nothing to cancel for an xVM domain, so this always
	# reports success.
	:
}
1468
#
# cancel_ldom_migration - cancel an in-flight logical domain migration and
# wait, within the stop time budget, for the domain to leave a transitional
# state.
#
cancel_ldom_migration()
{
	debug_message "Function: cancel_ldom_migration - Begin"
	${SET_DEBUG}

	# Ask the logical domain manager to cancel the migration.
	${LDM} cancel-operation migration ${DOMAIN} >> $LOGFILE 2>&1
	if (( ${?} == 0 ))
	then
		# SCMSGS
		# @explanation
		# The domain migration operation was cancelled.
		# @user_action
		# None required. Informational message.
		scds_syslog -p daemon.notice -t $(syslog_tag) -m \
		    "Migration of domain %s is cancelled, the domain state is now in active state." \
		    "${DOMAIN}"
	fi

	# Poll every 5 seconds while the domain reports a transitional state.
	# The loop is ended by moving SECONDS to the stop limit.
	until (( ${SECONDS} >= ${MAX_STOP_TIMEOUT} ))
	do
		if ! get_${VM}_status | ${GREP} -q -E "^suspending|^resuming|^suspended|^starting" > /dev/null 2>&1
		then
			SECONDS=${MAX_STOP_TIMEOUT}
		else
			sleep 5
		fi
	done

	debug_message "Function: cancel_ldom_migration - End"
}
1499
#
# domain_migrate - migrate or live migrate the domain to the node named in
# the cluster command log.
#
# Globals read:    MIGRATION_TYPE, VM, DOMAIN, RESOURCE, CCR_TABLE, CCRADM,
#                  LOGFILE, TARGET_HOST and PRIVATELINK_TARGET_HOST (both set
#                  by get_target_host)
# Globals written: MSG - wording used in the log messages
# Returns: 0 when the domain was migrated and the NO-OP flag was stored in
#          the CCR; 1 in every other case, so that the caller can perform a
#          normal shutdown.
#
domain_migrate()
{
	debug_message "Function: domain_migrate - Begin"
	${SET_DEBUG}

	typeset rc

	# Always start from a known wording so a stale MSG is never logged.
	MSG="migrated"
	[[ "${MIGRATION_TYPE}" = "MIGRATE_LIVE" ]] && MSG="live migrated"

	if get_target_host
	then
		# SCMSGS
		# @explanation
		# The domain is being migrated or live migrated to the target host.
		# @user_action
		# None required.
		scds_syslog -p daemon.notice -t $(syslog_tag) -m \
		    "Domain %s is being %s to %s." \
		    "${DOMAIN}" "${MSG}" "${TARGET_HOST}"

		migrate_${VM} "${MIGRATION_TYPE}" "${DOMAIN}" "${PRIVATELINK_TARGET_HOST}"
		rc=${?}

		if (( rc == 0 ))
		then
			# SCMSGS
			# @explanation
			# The domain was migrated or live migrated to the target host.
			# @user_action
			# None required. The domain successfully migrated or live migrated
			# from the source node to the target node.
			scds_syslog -p daemon.notice -t $(syslog_tag) -m \
			    "Domain %s successfully %s to %s." \
			    "${DOMAIN}" "${MSG}" "${TARGET_HOST}"

			# As the domain has been successfully migrated or live migrated
			# we need to indicate a successful stop by performing a NO-OP stop
			# and subsequently a successful start by performing a NO-OP start.

			if ${CCRADM} addkey --key=noop_${RESOURCE} --value="1" ${CCR_TABLE} >> $LOGFILE 2>&1
			then
				debug_message "domain_migrate - .noop_${RESOURCE} flag added to CCR"

				# Only announce the NO-OP stop once the flag is
				# stored; without it this function returns 1.

				# SCMSGS
				# @explanation
				# The domain was migrated or live migrated.
				# @user_action
				# None required. Informational message.
				scds_syslog -p daemon.notice -t $(syslog_tag) -m \
				    "NO-OP STOP being performed."
			else
				# SCMSGS
				# @explanation
				# Failed to add the NO-OP flag for the domain to the CCR.
				# @user_action
				# Check the syslog for further messages.
				# Determine why the ccr update failed.
				scds_syslog -p daemon.error -t $(syslog_tag) -m \
				    "Failed to add NO-OP flag for %s to ccr." \
				    "${DOMAIN}"
				rc=1
			fi

		elif (( rc == 99 ))
		then
			# Status 99 is handled as a migration time out.

			# SCMSGS
			# @explanation
			# The domain migration or live migration timed out.
			# @user_action
			# None required. Informational message.
			scds_syslog -p daemon.notice -t $(syslog_tag) -m \
			    "Migration of domain %s timed out, the domain state is now shut off." \
			    "${DOMAIN}"

			rc=1
			cancel_${VM}_migration
		else
			# SCMSGS
			# @explanation
			# The domain failed to migrate or live migrate to the target host.
			# @user_action
			# None required. The domain failed to migrate or live migrate
			# from the source node to the target node. A normal failover
			# will be performed.
			scds_syslog -p daemon.notice -t $(syslog_tag) -m \
			    "Domain %s failed to %s to %s, normal failover will be performed." \
			    "${DOMAIN}" "${MSG}" "${TARGET_HOST}"

			rc=1
			cancel_${VM}_migration
		fi
	else
		# No usable target host, the reason has already been logged.
		rc=1
	fi

	# If the domain has successfully migrated, we will now delete the domain.
	#
	# Doing this ensures that the domain is only defined and able to be started
	# on one cluster node at a time. Domains can use shared storage between cluster
	# nodes so it is very important that we prevent any data corruption if a domain
	# gets manually started on multiple cluster nodes where shared storage is used.
	#
	# Of course using SUNW.HAStoragePlus somewhat protects against this, however we
	# simply want to avoid any manual administrative errors performed by mistake.
	#
	# Note, unless the domain was migrated or live migrated, the domain is defined
	# before startup using a previously dumped XML file for the administrative file
	# system.

	(( rc == 0 )) && [[ "${VM}" == "xvm" ]] && domain_delete

	debug_message "Function: domain_migrate - End"
	return ${rc}
}
1615
1616 #
1617 # routines to perform domain shutdown
1618 #
#
# shutdown_xvm - request a shutdown of the xVM domain through virsh.
#
# Returns: the exit status of the virsh shutdown command.
#
shutdown_xvm()
{
	debug_message "Function: shutdown_xvm - Begin"
	${SET_DEBUG}

	typeset exit_code

	# virsh shutdown returns before the domain has actually shut down,
	# which is why it is not run under hatimerun.
	if ${VIRSH} shutdown ${DOMAIN} > /dev/null 2>&1
	then
		exit_code=0
	else
		exit_code=${?}
	fi

	debug_message "Function: shutdown_xvm - End"
	return ${exit_code}
}
1635
#
# shutdown_ldom - stop the logical domain with "ldm stop-domain", bounded by
# hatimerun and MAX_STOP_TIMEOUT.
#
# Globals read: VM, DOMAIN, GREP, HATIMERUN, LDM, MAX_STOP_TIMEOUT, LOGFILE
# Returns: 0 when the domain was stopped or is not in a running/transitional
#          state, 2 when its status could not be obtained (treated by the
#          caller as "domain not present"), otherwise the exit status of
#          hatimerun.
#
shutdown_ldom()
{
	debug_message "Function: shutdown_ldom - Begin"
	${SET_DEBUG}

	typeset rc
	typeset domain_state

	if domain_state=$(get_${VM}_status)
	then
		if echo ${domain_state} | ${GREP} -q -E "^active$|^suspending|^resuming|^suspended|^starting" > /dev/null 2>&1
		then
			${HATIMERUN} -t ${MAX_STOP_TIMEOUT} -k KILL ${LDM} stop-domain ${DOMAIN} >> $LOGFILE 2>&1
			rc=${?}
		else
			# domain is already stopped
			rc=0
		fi
	else
		# domain is not present.
		rc=2
	fi

	debug_message "Function: shutdown_ldom - End"
	return ${rc}
}
1662
#
# domain_shutdown - shut the domain down gracefully, terminate it when that
# does not work, and delete it from this node once it is down.
#
# Returns: 0 when the domain is down (or was not present), otherwise the
#          exit status of destroy_${VM}.
#
domain_shutdown()
{
	debug_message "Function: domain_shutdown - Begin"
	${SET_DEBUG}

	typeset result

	# Coordinate with the domain OS to perform a graceful shutdown.
	shutdown_${VM}
	result=${?}

	case ${result} in
	2)
		# Status 2 from the shutdown routine: nothing left to stop or delete.
		debug_message "Function: domain_shutdown - End"
		return 0
		;;
	0)
		# Poll every 5 seconds until the domain is down or the stop
		# time budget is used up. The loop is left by moving SECONDS
		# to the limit.
		until (( ${SECONDS} >= ${MAX_STOP_TIMEOUT} ))
		do
			if is_${VM}_up
			then
				sleep 5
			else
				SECONDS=${MAX_STOP_TIMEOUT}
			fi
		done

		if ! is_${VM}_up
		then
			# SCMSGS
			# @explanation
			# The domain was shutdown gracefully.
			# @user_action
			# None required. The domain has shutdown gracefully.
			scds_syslog -p daemon.info -t $(syslog_tag) -m \
			    "Domain %s has been gracefully shutdown." \
			    "${DOMAIN}"
			result=0
		else
			# SCMSGS
			# @explanation
			# The domain failed to shutdown gracefully.
			# @user_action
			# None required. The domain failed to shutdown
			# gracefully and will now be immediately terminated.
			scds_syslog -p daemon.notice -t $(syslog_tag) -m \
			    "Domain %s failed to shutdown gracefully, immediate shutdown will now be performed." \
			    "${DOMAIN}"

			destroy_${VM}
			result=${?}
		fi
		;;
	*)
		# The shutdown request itself failed (error already logged),
		# so terminate the domain straight away.
		destroy_${VM}
		result=${?}
		;;
	esac

	# Once the domain is down it is deleted from this node, so that it is
	# only defined, and can only be started, on one cluster node at a time.
	# This guards domains on shared storage against being started by hand
	# on several nodes. The domain is defined again before start-up from
	# its previously dumped configuration.
	if (( ${result} == 0 ))
	then
		domain_delete
	fi

	debug_message "Function: domain_shutdown - End"
	return ${result}
}
1746
1747 #
1748 # routines to destroy domain
1749 #
#
# destroy_xvm - immediately terminate the xVM domain with "virsh destroy".
#
# Returns: 0 when the domain was terminated, 1 otherwise.
#
destroy_xvm()
{
	debug_message "Function: destroy_xvm - Begin"
	${SET_DEBUG}

	typeset outcome=1

	${VIRSH} destroy ${DOMAIN} >> $LOGFILE 2>&1
	if (( ${?} != 0 ))
	then
		# SCMSGS
		# @explanation
		# The /usr/bin/virsh destroy command failed.
		# @user_action
		# Determine why it was not possible to immediately terminate
		# the domain.
		scds_syslog -p daemon.error -t $(syslog_tag) -m \
		    "Domain %s failed to shutdown immediately." \
		    "${DOMAIN}"
		outcome=1
	else
		# SCMSGS
		# @explanation
		# The domain was immediately terminated.
		# @user_action
		# None required. The domain had previously failed to shutdown
		# gracefully but has now been immediately terminated.
		scds_syslog -p daemon.notice -t $(syslog_tag) -m \
		    "Domain %s has been immediately terminated." \
		    "${DOMAIN}"
		outcome=0
	fi

	debug_message "Function: destroy_xvm - End"
	return ${outcome}
}
1785
#
# destroy_ldom - forcefully stop the logical domain with "ldm stop-domain -f".
#
# Returns: 0 when the domain was stopped, 1 otherwise.
#
destroy_ldom()
{
	debug_message "Function: destroy_ldom - Begin"
	${SET_DEBUG}

	typeset outcome=0

	if ! ${LDM} stop-domain -f ${DOMAIN} >> $LOGFILE 2>&1
	then
		# SCMSGS
		# @explanation
		# The /opt/SUNWldm/bin/ldm stop-domain "-f" command failed.
		# @user_action
		# Determine why it was not possible to forcefully stop
		# the domain.
		scds_syslog -p daemon.error -t $(syslog_tag) -m \
		    "Domain %s failed to do a forceful shutdown." \
		    "${DOMAIN}"
		outcome=1
	else
		# SCMSGS
		# @explanation
		# The domain was immediately terminated.
		# @user_action
		# None required. The domain had previously failed to shutdown
		# gracefully but has now been immediately terminated.
		scds_syslog -p daemon.notice -t $(syslog_tag) -m \
		    "Domain %s has been forcefully terminated." \
		    "${DOMAIN}"
		outcome=0
	fi

	debug_message "Function: destroy_ldom - End"
	return ${outcome}
}
1821
1822 #
1823 # routines to remove domains from the node
1824 #
#
# domain_delete - remove the domain definition from this node by calling the
# delete routine for the configured VM type.
#
# Returns: 0 when the domain was deleted, 1 otherwise.
#
domain_delete()
{
	debug_message "Function: domain_delete - Begin"
	${SET_DEBUG}

	# Deleting the domain after shutdown avoids the possibility of someone
	# manually starting it on a different node, which would compromise a
	# domain that uses shared storage. The domain's configuration is always
	# dumped to the agent's administrative file system, so the domain can be
	# defined again before startup.

	# Assume failure; delete_${VM} has already logged the reason.
	typeset outcome=1

	if delete_${VM}
	then
		# SCMSGS
		# @explanation
		# The domain was deleted.
		# @user_action
		# None required. The domain has been deleted as it
		# will be defined on another node. Deleting the domain
		# on this node ensures that it can't be started on
		# more than one cluster node at a time.
		scds_syslog -p daemon.notice -t $(syslog_tag) -m \
		    "Domain %s has been deleted on this node." \
		    "${DOMAIN}"
		outcome=0
	fi

	debug_message "Function: domain_delete - End"
	return ${outcome}
}
1860
#
# delete_xvm - delete the xVM domain definition on this node with "xm delete".
#
# Globals read: XM, DOMAIN, LOGFILE
# Returns: 0 when the domain was deleted, 1 otherwise (an error is logged).
#
delete_xvm()
{
	debug_message "Function: delete_xvm - Begin"
	${SET_DEBUG}

	typeset rc=0

	# Use the XM command variable from the commands definition at the top
	# of the file, as migrate_xvm does, rather than a second hard-coded
	# copy of the path.
	if ! ${XM} delete ${DOMAIN} >> $LOGFILE 2>&1
	then
		# SCMSGS
		# @explanation
		# The /usr/sbin/xm delete command failed.
		# @user_action
		# Determine why it was not possible to delete the domain.
		scds_syslog -p daemon.error -t $(syslog_tag) -m \
		    "Failed to delete domain %s on this node." \
		    "${DOMAIN}"
		rc=1
	fi

	debug_message "Function: delete_xvm - End"
	return ${rc}
}
1884
#
# delete_ldom - remove the logical domain from this node, unbinding it first
# when it is in the bound state.
#
# Returns: 0 when the domain was removed, 1 when the unbind or the remove
#          failed (an error is logged).
#
delete_ldom()
{
	debug_message "Function: delete_ldom - Begin"
	${SET_DEBUG}

	# If the domain is in the bound state, unbind it first.
	if get_${VM}_status | ${GREP} -q -E "^bound$" > /dev/null 2>&1
	then
		if ! ${LDM} unbind-domain ${DOMAIN} >> $LOGFILE 2>&1
		then
			# SCMSGS
			# @explanation
			# The /opt/SUNWldm/bin/ldm unbind-domain command failed.
			# @user_action
			# Determine why it was not possible to unbind the domain.
			scds_syslog -p daemon.error -t $(syslog_tag) -m \
			    "Failed to unbind domain %s on this node." \
			    "${DOMAIN}"

			debug_message "Function: delete_ldom - End"
			return 1
		fi
	fi

	${LDM} remove-domain ${DOMAIN} >> $LOGFILE 2>&1
	if (( ${?} != 0 ))
	then
		# SCMSGS
		# @explanation
		# The /opt/SUNWldm/bin/ldm remove-domain command failed.
		# @user_action
		# Determine why it was not possible to remove the domain.
		scds_syslog -p daemon.error -t $(syslog_tag) -m \
		    "Failed to remove domain %s on this node." \
		    "${DOMAIN}"

		debug_message "Function: delete_ldom - End"
		return 1
	fi

	debug_message "Function: delete_ldom - End"
	return 0
}