1 #!/bin/ksh
2 #
3 # CDDL HEADER START
4 #
5 # The contents of this file are subject to the terms of the
6 # Common Development and Distribution License (the License).
7 # You may not use this file except in compliance with the License.
8 #
9 # You can obtain a copy of the license at usr/src/CDDL.txt
10 # or http://www.opensolaris.org/os/licensing.
11 # See the License for the specific language governing permissions
12 # and limitations under the License.
13 #
14 # When distributing Covered Code, include this CDDL HEADER in each
15 # file and include the License file at usr/src/CDDL.txt.
16 # If applicable, add the following below this CDDL HEADER, with the
17 # fields enclosed by brackets [] replaced with your own identifying
18 # information: Portions Copyright [yyyy] [name of copyright owner]
19 #
20 # CDDL HEADER END
21 #
22 # Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 # Use is subject to license terms.
24 #
25 # ident "%Z%%M% %I% %E% SMI"
26 #
27
#
# Agent identity and working files.
#
# NOTE(review): VM and RESOURCE are expanded right here, so they presumably
# have to be set by whatever reads this file before this point - confirm.
#
PKG="SUNWscxvm"
TASK_COMMAND=
RESOURCE_PROJECT_NAME=
TMP_DIR=/var/tmp
CCR_TABLE="${VM}_domain_config"
LOGFILE="${TMP_DIR}/${RESOURCE}_logfile"

#
# Absolute paths of the external commands used by the functions below.
#

# Logging
SCLOGGER="/usr/cluster/lib/sc/scds_syslog"
LOGGER="/usr/bin/logger"

# General purpose utilities
GREP="/usr/xpg4/bin/grep"
AWK="/usr/bin/awk"
PGREP="/usr/bin/pgrep"
SLEEP="/usr/bin/sleep"
TR="/usr/xpg4/bin/tr"

# Cluster framework
SCHA_RESOURCE_GET="/usr/cluster/bin/scha_resource_get"
SCHA_RESOURCEGROUP_GET="/usr/cluster/bin/scha_resourcegroup_get"
SCHA_CLUSTER_GET="/usr/cluster/bin/scha_cluster_get"
HATIMERUN="/usr/cluster/bin/hatimerun"
CCRADM="/usr/cluster/lib/sc/ccradm"
CL_EXEC_CLIENT="/usr/cluster/lib/sc/cl_exec_client"

# Virtualisation managers
LDM="/opt/SUNWldm/bin/ldm"
VIRSH="/usr/bin/virsh"
XM="/usr/sbin/xm"
52
#
# syslog_tag
#
# Print the tag used for this agent's syslog entries, in the form
#	SC[<package>.<method>]:<resource group>:<resource>
# Any of PKG, METHOD, RESOURCEGROUP or RESOURCE that is unset or empty is
# shown as "??".
#
syslog_tag()
{
	${SET_DEBUG}

	typeset pkg="${PKG:-??}"
	typeset method="${METHOD:-??}"
	typeset rgroup="${RESOURCEGROUP:-??}"
	typeset rsrc="${RESOURCE:-??}"

	print "SC[${pkg}.${method}]:${rgroup}:${rsrc}"
}
58
#
# scds_syslog -p <facility.level> -t <tag> -m <format> [<argument> ...]
#
# Log a printf-style message.
#
#   - When ${SCLOGGER} is installed the untouched argument list is handed
#     to it and it is left running in the background.
#   - Otherwise the message is formatted with /usr/bin/printf and sent
#     through ${LOGGER} with the requested priority and tag.
#   - When METHOD is "validate" the formatted message is additionally
#     written to stdout (without a trailing newline).
#
# The caller's OPTIND is preserved.
#
scds_syslog()
{
	typeset opt
	typeset pri="daemon.notice"
	typeset tag=""
	typeset msg
	typeset saved_optind=${OPTIND}

	if [ -f "${SCLOGGER}" ]
	then
		${SCLOGGER} "$@" &
	fi

	#
	# getopts keeps its position in the global OPTIND.  Unless it is
	# rewound on every call, the second and later invocations parse no
	# options at all and silently reuse a stale priority and tag.
	#
	OPTIND=1
	while getopts ':p:t:m' opt
	do
		case "${opt}" in
		t)	tag=${OPTARG};;
		p)	pri=${OPTARG};;
		esac
	done

	# What is left is the format string followed by its arguments.
	shift $((OPTIND - 1))
	OPTIND=${saved_optind}

	if [ ! -f "${SCLOGGER}" ]
	then
		msg=$(/usr/bin/printf "$@")
		${LOGGER} -p "${pri}" -t "${tag}" "${msg}"
	fi

	if [[ "${METHOD}" == "validate" ]]
	then
		/usr/bin/printf "$@"
	fi
}
84
#
# debug_message <text> [<more text> ...]
#
# Emit a daemon.debug syslog entry depending on DEBUG_LEVEL, and set
# SET_DEBUG, which the other functions expand as a command:
#
#	0	nothing is logged, SET_DEBUG is cleared
#	1	only text containing "Begin" or "End" is logged,
#		SET_DEBUG is cleared
#	2	everything is logged, SET_DEBUG becomes "set -x"
#
# Any other DEBUG_LEVEL logs nothing and leaves SET_DEBUG untouched.
# All arguments are joined with single spaces so that callers which split
# a long message over several arguments do not lose the tail.
#
# Always returns 0: not having anything to log is not an error.
#
debug_message()
{
	typeset DEBUG_TEXT=

	case ${DEBUG_LEVEL} in
	0)	# No debug msgs
		SET_DEBUG=
		;;
	1)	# Begin and End msgs
		SET_DEBUG=
		DEBUG_TEXT=$(echo "$*" | ${GREP} -E 'Begin|End')
		;;
	2)	# All debug msgs
		SET_DEBUG="set -x"
		DEBUG_TEXT="$*"
		;;
	esac

	if [[ -n "${DEBUG_TEXT}" ]]
	then
		scds_syslog -p daemon.debug -t $(syslog_tag) -m \
		    "%s" "${DEBUG_TEXT}"
	fi

	return 0
}
107
#
# log_message <priority> <header>
#
# Forward the contents of ${LOGFILE} to syslog, one entry per line, as
# "<header> - <line>" at priority daemon.<priority>.  Afterwards both
# ${LOGFILE} and ${LOGFILE}.copy are truncated, whether or not anything
# was logged.
#
# PRIORITY, HEADER and MSG_TXT are left set as global variables.
#
log_message()
{
	debug_message "Function: log_message - Begin"

	${SET_DEBUG}

	if [ -s "${LOGFILE}" ]
	then
		PRIORITY=${1}
		HEADER=${2}

		#
		# Ensure that the while loop only reads a closed file.
		# strings(1) keeps only the printable parts of the log.
		#
		strings "${LOGFILE}" > "${LOGFILE}.copy"

		# -r: log backslashes as they were written by the command.
		while read -r MSG_TXT
		do
			scds_syslog -p daemon.${PRIORITY} -t $(syslog_tag) -m \
			    "%s - %s" "${HEADER}" "${MSG_TXT}"
		done < "${LOGFILE}.copy"
	fi

	: > "${LOGFILE}"
	: > "${LOGFILE}.copy"

	debug_message "Function: log_message - End"
}
139
140
#
# get_resource_property <resource> <extension property>
#
# Query an extension property with scha_resource_get and print its
# value(s), one per line.  The first whitespace-separated word of the
# scha_resource_get output is dropped (presumably the property type -
# confirm against scha_resource_get(1HA)).  When no value is left a
# single empty line is printed.
#
# The raw output is left in the global OUTPUT.
# Returns the exit status of scha_resource_get; nothing is printed when
# it is non-zero.
#
get_resource_property()
{
	debug_message "Function: get_resource_property - Begin"
	${SET_DEBUG}

	typeset RS=${1}
	typeset PROPERTY=${2}
	typeset rc

	# Retrieve the property value.
	OUTPUT=$(${SCHA_RESOURCE_GET} -O Extension -R "${RS}" "${PROPERTY}")
	rc=${?}

	# One argument: debug_message callers must not split the text.
	debug_message "get_resource_property - scha_resource_get of property ${PROPERTY} returned ${rc}"

	if (( rc == 0 ))
	then
		#
		# Print every word except the very first one.  The output is
		# piped quoted so that a value such as "*" is not expanded
		# against the current directory by the shell.
		#
		echo "${OUTPUT}" | ${AWK} '
		{
			for (i = 1; i <= NF; i++) {
				seen++
				if (seen > 1) {
					print $i
					printed = 1
				}
			}
		}
		END {
			if (printed != 1)
				print ""
		}'
	fi

	debug_message "Function: get_resource_property - End"

	return ${rc}
}
168
169
#
# get_properties <property> [<property> ...]
#
# Fetch the named extension properties of ${RESOURCE} and store them in
# the matching global, but only when that global is still empty:
#
#	Domain_name	-> DOMAIN
#	Migration_type	-> MIGRATION_TYPE
#	Plugin_probe	-> PLUGIN_PROBE
#	Password_file	-> PASSWORD_FILE
#	Debug_level	-> DEBUG_LEVEL
#
# Other property names are fetched but not stored.  Processing stops at
# the first property that cannot be retrieved; an error is logged and the
# status of get_resource_property is returned.  Returns 0 otherwise.
#
get_properties()
{
	debug_message "Function: get_properties - Begin"
	${SET_DEBUG}

	typeset -i rc=0
	typeset props=$*
	typeset val

	for prop in ${props}
	do
		#
		# Retrieve the property value.  The assignment is kept apart
		# from the typeset above on purpose: "typeset val=$(...)"
		# reports the status of typeset, which hides every failure of
		# get_resource_property.
		#
		val=$(get_resource_property "${RESOURCE}" "${prop}")
		rc=${?}

		if (( rc == 0 ))
		then
			case ${prop} in
			Domain_name)
				[[ -z ${DOMAIN} ]] && DOMAIN=${val};;
			Migration_type)
				[[ -z ${MIGRATION_TYPE} ]] && MIGRATION_TYPE=${val};;
			Plugin_probe)
				[[ -z ${PLUGIN_PROBE} ]] && PLUGIN_PROBE=${val};;
			Password_file)
				[[ -z ${PASSWORD_FILE} ]] && PASSWORD_FILE=${val};;
			Debug_level)
				[[ -z ${DEBUG_LEVEL} ]] && DEBUG_LEVEL=${val};;
			esac
		else
			# SCMSGS
			# @explanation
			# The scha_resource_get call failed.
			# @user_action
			# Check the syslog for further messages.
			scds_syslog -p daemon.error -t $(syslog_tag) -m \
			    "Cannot get the property %s of resource %s." \
			    "${prop}" "${RESOURCE}"
			break
		fi
	done

	debug_message "Function: get_properties - End"

	return ${rc}
}
210
#
# validate_xvm
#
# Check that this node runs on the i86xpv platform as reported by
# "uname -i".  Logs an error and returns 1 otherwise; returns 0 on success.
#
validate_xvm()
{
	debug_message "Function: validate_xvm - Begin"
	${SET_DEBUG}

	typeset rc=1

	case "$(/usr/bin/uname -i)" in
	i86xpv)
		rc=0
		;;
	*)
		# SCMSGS
		# @explanation
		# Solaris is not booted with xVM.
		# @user_action
		# Ensure that the default boot grub menu is set to boot
		# Solaris xVM.
		scds_syslog -p daemon.error -t $(syslog_tag) -m \
		    "Node is not booted with xVM."
		;;
	esac

	debug_message "Function: validate_xvm - End"

	return ${rc}
}
236
#
# validate_ldom
#
# Validate the LDom configuration of the resource.  In order:
#
#   1. ${PASSWORD_FILE} must be readable.
#   2. "ldm ls" must succeed on this node.
#   3. The failure-policy of the primary domain must be "reset".
#   4. Every node of the resource group node list that is not DOWN is
#      asked for the state of ${DOMAIN}.  Unless ${update} is non-zero the
#      domain must not be active, suspending, resuming, suspended or
#      starting on any node.  The configuration found on the local node is
#      saved with dump_domain_config.
#   5. The domain must be configured on at most one node; when it is
#      configured on none, a saved configuration must exist in the CCR.
#
# NOTE(review): ${update} is not set in this function; it is presumably
# provided by the caller - confirm.
#
# Returns 0 when the configuration is valid, 1 otherwise (error logged).
#
validate_ldom()
{
	debug_message "Function: validate_ldom - Begin"
	${SET_DEBUG}

	typeset ncount=0
	# Start from an empty list so that node names from the environment or
	# from an earlier call cannot leak into the error message below.
	typeset nlist=""

	# Make sure that the password file is readable.
	if [ ! -r "${PASSWORD_FILE}" ]
	then
		# SCMSGS
		# @explanation
		# Incorrect Password file specified.
		# @user_action
		# Ensure that a valid password file is specified.
		scds_syslog -p daemon.error -t $(syslog_tag) -m \
		    "Invalid password file specified %s." \
		    "${PASSWORD_FILE}"

		debug_message "Function: validate_ldom - End"
		return 1
	fi

	# Ensure that the control domain is a cluster node.
	if ! ${LDM} ls > /dev/null 2>&1
	then
		# SCMSGS
		# @explanation
		# Self explanatory.
		# @user_action
		# Ensure that the resource is configured in
		# control domain.
		scds_syslog -p daemon.error -t $(syslog_tag) -m \
		    "The LDom Manager is running in configuration mode."

		debug_message "Function: validate_ldom - End"
		return 1
	fi

	# Ensure that the failure-policy setting is set to "reset".
	# If the control domain fails, this would allow the guest domains
	# to panic.
	policy=$(${LDM} list -o domain primary \
	    | ${AWK} -F"=" '$1~/failure-policy/ {print $2}')

	if [ "${policy}" != "reset" ]
	then
		# SCMSGS
		# @explanation
		# Incorrect failure-policy setting for the domain.
		# @user_action
		# Ensure that the failure-policy for the domain is
		# set to "reset" on the control domain.
		scds_syslog -p daemon.error -t $(syslog_tag) -m \
		    "Invalid failure policy \"%s\" for %s domain." \
		    "${policy}" "primary"

		debug_message "Function: validate_ldom - End"
		return 1
	fi

	# The CL_EXEC_CLIENT program executes a command on any of the
	# cluster nodes or a zone or in a zone cluster. It then generates
	# as output the exit status of command and the stdout and stderr
	# messages. The valid options are:
	# [ -z zoneclustername] The command is run on the zone cluster
	#	represented by the zonename.
	# -C { TS | RT | FSS | FX } The scheduling class in which the
	#	command is to be run.
	# -p pri Specifies the priority of the command in the given
	#	scheduling class.
	# -n id[,id..] A comma separated list of node IDs of a
	#	zone cluster or a node to run the command.
	# -c cmd [Args] The command to be run along with its arguments.

	for nodename in $(${SCHA_RESOURCEGROUP_GET} -O NODELIST -G ${RESOURCEGROUP})
	do
		if [[ "$(${SCHA_CLUSTER_GET} -O NodeState_Node ${nodename})" == "DOWN" ]]
		then
			continue
		fi

		nodeid=$(${SCHA_CLUSTER_GET} -O NODEID_NODENAME ${nodename})
		output=$(${CL_EXEC_CLIENT} -n ${nodeid} -c "${LDM} list-domain ${DOMAIN}")
		result=${?}

		# Word 6 of the joined output is taken as the exit status of
		# the remote command and word 18 as the domain state.
		status=$(echo ${output} | ${AWK} '{print $6}')

		#
		# The remote status is compared as a string: an arithmetic
		# test is a syntax error when the word is missing and is true
		# for any non-numeric word.
		#
		if (( ${result} == 0 )) && [[ "${status}" == "0" ]]
		then
			domstate=$(echo $output | ${AWK} -F" " '{print $18}')

			if (( ${update} == 0 )) && echo $domstate | ${GREP} -q -E "^active$|suspending|resuming|suspended|starting" > /dev/null 2>&1
			then
				# SCMSGS
				# @explanation
				# The domain is in an invalid state.
				# @user_action
				# Ensure that the domain is in inactive or bound state.
				scds_syslog -p daemon.error -t $(syslog_tag) -m \
				    "Domain %s is in %s state on %s." \
				    "${DOMAIN}" "${domstate}" "${nodename}"

				debug_message "Function: validate_ldom - End"
				return 1
			fi

			ncount=$((ncount+1))
			nlist="${nodename}${nlist:+ ${nlist}}"

			# dump domain configuration to ccr
			if [[ "$(/usr/bin/hostname)" == "${nodename}" ]]
			then
				if ! dump_domain_config
				then
					debug_message "Function: validate_ldom - End"
					return 1
				fi
			fi
		fi
	done

	# Not configured anywhere: a saved configuration must exist.
	if (( ncount == 0 ))
	then
		if ! ${CCRADM} showkey --key xml_${RESOURCE} ${CCR_TABLE} > /dev/null 2>&1
		then
			scds_syslog -p daemon.error -t $(syslog_tag) -m \
			    "Domain %s does not exist." \
			    "${DOMAIN}"

			debug_message "Function: validate_ldom - End"
			return 1
		fi
	fi

	if [[ ${ncount} -gt 1 ]]
	then
		# SCMSGS
		# @explanation
		# The domain is configured on multiple
		# cluster nodes.
		# @user_action
		# Ensure that the domain is configured on one node
		# of the cluster.
		scds_syslog -p daemon.error -t $(syslog_tag) -m \
		    "Multiple domain %s configuration exists on %s." \
		    "${DOMAIN}" "${nlist}"

		debug_message "Function: validate_ldom - End"
		return 1
	fi

	debug_message "Function: validate_ldom - End"
	return 0
}
388
#
# validate
#
# Validate the resource configuration.  If a plugin probe is configured
# and its command exists as a regular file, that file must be readable;
# a probe file that does not exist is not treated as an error here.
# The VM specific checks are then delegated to validate_${VM}.
#
# PLUGIN_PROBE may carry arguments after the command, so only its first
# word is tested, the same way check_domain locates the probe.
#
# Returns 1 for an unreadable probe, otherwise the status of
# validate_${VM}.
#
validate()
{
	debug_message "Function: validate - Begin"
	${SET_DEBUG}

	typeset rc
	typeset probe_path

	# Make sure that the plugin probe specified is readable.
	if [[ -n "${PLUGIN_PROBE}" ]]
	then
		probe_path=$(echo ${PLUGIN_PROBE} | ${AWK} '{print $1}')

		if [ -f "${probe_path}" ] && [ ! -r "${probe_path}" ]
		then
			# SCMSGS
			# @explanation
			# Incorrect user probe file specified.
			# @user_action
			# Ensure that a valid user probe file is specified.
			scds_syslog -p daemon.error -t $(syslog_tag) -m \
			    "Invalid user probe file %s." \
			    "${PLUGIN_PROBE}"

			debug_message "Function: validate - End"
			return 1
		fi
	fi

	validate_${VM}
	rc=${?}

	debug_message "Function: validate - End"
	return ${rc}
}
420
421 #
422 # get the domain status
423 #
#
# get_xvm_status
#
# Run "virsh domstate" for ${DOMAIN}; its output goes to stdout and its
# exit status is returned.
#
get_xvm_status()
{
	debug_message "Function: get_xvm_status - Begin"
	${SET_DEBUG}

	typeset rc=0

	${VIRSH} domstate ${DOMAIN} || rc=${?}

	debug_message "Function: get_xvm_status - End"
	return ${rc}
}
437
#
# get_ldom_status
#
# Print the 10th whitespace-separated word of the "ldm list-domain"
# output for ${DOMAIN}, which the callers use as the domain state.
# The raw output is left in the global OUTPUT.
#
# Returns 1 when ldm fails (nothing is printed), otherwise the status of
# the awk pipeline.
#
get_ldom_status()
{
	debug_message "Function: get_ldom_status - Begin"
	${SET_DEBUG}

	typeset rc=1

	if OUTPUT=$(${LDM} list-domain ${DOMAIN})
	then
		echo ${OUTPUT} | ${AWK} '{print $10}'
		rc=${?}
	fi

	debug_message "Function: get_ldom_status - End"
	return ${rc}
}
456
457 #
458 # Routines to create the domain on the current cluster node.
459 #
#
# add_xvm_domain
#
# Define the domain on this node with "virsh define", using the XML file
# ${TMP_DIR}/${RESOURCE}.xml.  Command output is appended to ${LOGFILE}.
#
# Returns 0 on success, 1 (error logged) when virsh fails.
#
add_xvm_domain()
{
	debug_message "Function: add_xvm_domain - Begin"
	${SET_DEBUG}

	typeset rc=1

	if ${VIRSH} define ${TMP_DIR}/${RESOURCE}.xml >> $LOGFILE 2>&1
	then
		rc=0
	else
		# SCMSGS
		# @explanation
		# Defining the domain using an XML file failed.
		# @user_action
		# The command /usr/bin/virsh define failed to define the domain.
		# Determine if you have specified the correct domain name while
		# registering the resource.
		scds_syslog -p daemon.error -t $(syslog_tag) -m \
		    "Failed to define %s using %s/%s.xml." \
		    "${DOMAIN}" "${TMP_DIR}" "${RESOURCE}"
	fi

	debug_message "Function: add_xvm_domain - End"
	return ${rc}
}
485
#
# add_ldom_domain
#
# Create ${DOMAIN} on this node with "ldm add-domain -i", using the XML
# file ${TMP_DIR}/${RESOURCE}.xml.  Command output is appended to
# ${LOGFILE}.
#
# Returns 0 on success, 1 (error logged) when ldm fails.
#
add_ldom_domain()
{
	debug_message "Function: add_ldom_domain - Begin"
	${SET_DEBUG}

	typeset rc=1

	if ${LDM} add-domain -i ${TMP_DIR}/${RESOURCE}.xml ${DOMAIN} >> $LOGFILE 2>&1
	then
		rc=0
	else
		# SCMSGS
		# @explanation
		# Defining the domain using an XML file failed.
		# @user_action
		# The command /opt/SUNWldm/bin/ldm "add-domain"
		# failed to define the domain. Determine if you
		# have specified the correct domain name when
		# registering the resource.
		scds_syslog -p daemon.error -t $(syslog_tag) -m \
		    "Failed to add the domain %s using %s/%s.xml." \
		    "${DOMAIN}" "${TMP_DIR}" "${RESOURCE}"
	fi

	debug_message "Function: add_ldom_domain - End"
	return ${rc}
}
512
513 #
514 # test if domain is active
515 #
#
# is_xvm_up
#
# Returns 0 when "virsh domstate" reports ${DOMAIN} as running, blocked,
# paused or in shutdown; otherwise the non-zero status of grep.
#
is_xvm_up()
{
	debug_message "Function: is_xvm_up - Begin"
	${SET_DEBUG}

	typeset state
	typeset rc

	state=$(${VIRSH} domstate ${DOMAIN})

	echo ${state} | \
	    ${GREP} -q -E "running|blocked|paused|in shutdown" > /dev/null 2>&1
	rc=${?}

	debug_message "Function: is_xvm_up - End"
	return ${rc}
}
530
#
# is_ldom_up
#
# Returns 0 when get_ldom_status prints exactly "active" or "starting";
# otherwise the non-zero status of grep.
#
is_ldom_up()
{
	debug_message "Function: is_ldom_up - Begin"
	${SET_DEBUG}

	typeset rc=0

	get_ldom_status | \
	    ${GREP} -q -E "^active$|^starting$" > /dev/null 2>&1 || rc=${?}

	debug_message "Function: is_ldom_up - End"
	return ${rc}
}
544
545 #
546 # wrapper routines to start xvm or ldom domains
547 #
#
# start_xvm
#
# Start ${DOMAIN} with "virsh start".  Command output is appended to
# ${LOGFILE}; the exit status of virsh is returned.
#
start_xvm()
{
	debug_message "Function: start_xvm - Begin"
	${SET_DEBUG}

	typeset rc

	if ${VIRSH} start ${DOMAIN} >> $LOGFILE 2>&1
	then
		rc=0
	else
		rc=${?}
	fi

	debug_message "Function: start_xvm - End"
	return ${rc}
}
561
562 #
563 # After a crash/reboot of the node, the domain
564 # would be started and there would be multiple
565 # instances of the same domain across cluster
566 # nodes. Hence the domain is destroyed.
567 #
#
# init_ldom
#
# Shut the domain down through domain_shutdown, with the global
# MAX_STOP_TIMEOUT set to the INIT_TIMEOUT of the resource.
# Returns the status of domain_shutdown.
#
init_ldom()
{
	debug_message "Function: init_ldom - Begin"
	${SET_DEBUG}

	typeset rc=0

	# Read by domain_shutdown as the time it is allowed to take.
	MAX_STOP_TIMEOUT=$(${SCHA_RESOURCE_GET} -O INIT_TIMEOUT \
	    -R ${RESOURCE} -G ${RESOURCEGROUP} )

	domain_shutdown || rc=${?}

	debug_message "Function: init_ldom - End"
	return ${rc}
}
584
#
# start_ldom
#
# Bind (when the domain is still inactive) and start ${DOMAIN}, then wait
# until "ldm list-domain -p" reports the flags "-n----".  There is no
# timeout on that wait inside this function.
#
# Command output is appended to ${LOGFILE}.
# Returns 0 on success, 1 when binding, starting or changing the
# auto-boot? variable fails.
#
start_ldom()
{
	debug_message "Function: start_ldom - Begin"
	${SET_DEBUG}

	typeset rc=0

	if get_${VM}_status | ${GREP} -q -E "^inactive$" > /dev/null 2>&1
	then
		if ${LDM} bind-domain "${DOMAIN}" >> $LOGFILE 2>&1
		then
			# SCMSGS
			# @explanation
			# The domain was bound.
			# @user_action
			# None required. The domain has been bound on this node.
			scds_syslog -p daemon.notice -t $(syslog_tag) -m \
			    "Domain %s is bound." \
			    "${DOMAIN}"
			rc=0
		else
			# SCMSGS
			# @explanation
			# The /opt/SUNWldm/bin/ldm bind-domain command failed.
			# @user_action
			# Determine why it was not possible to bind the domain.
			scds_syslog -p daemon.error -t $(syslog_tag) -m \
			    "Failed to bind %s." \
			    "${DOMAIN}"
			rc=1
		fi

	fi

	#
	# auto-boot? is switched on only for the duration of this start and
	# switched off again afterwards, so that after a reboot/crash the
	# domain sits at the OBP prompt and does not boot the guest domain
	# OS on its own.
	#
	# "auto-boot?=..." is quoted because the "?" would otherwise be a
	# shell pattern and could be replaced by a matching file name from
	# the current directory.
	#
	if (( ${rc} == 0 )) && ${LDM} set-var "auto-boot?=true" "${DOMAIN}" >> $LOGFILE 2>&1
	then
		if ${LDM} start-domain "${DOMAIN}" >> $LOGFILE 2>&1
		then
			# Poll once a second until the domain flags read "-n----".
			while :
			do
				flag=$(${LDM} list-domain -p "${DOMAIN}" | ${GREP} "${DOMAIN}" \
				    | ${AWK} -F"|" '{print $4}' | ${AWK} -F"=" '{print $2}')
				[[ "${flag}" == "-n----" ]] && break
				${SLEEP} 1
			done
		else
			rc=1
		fi
		${LDM} set-var "auto-boot?=false" "${DOMAIN}" >> $LOGFILE 2>&1 || rc=1
	else
		rc=1
	fi

	debug_message "Function: start_ldom - End"
	return ${rc}
}
645
#
# start_domain
#
# START method body.  Makes sure the domain is defined on this node
# (re-creating it from the XML saved in the CCR if necessary), starts it
# unless a NO-OP start was requested or it is already up, and finally
# saves the current domain configuration with dump_domain_config.
#
# Returns 0 on success, 1 when the domain cannot be defined, the NO-OP
# flag cannot be removed or the domain fails to start; otherwise the
# status of dump_domain_config.
#
start_domain()
{
	debug_message "Function: start_domain - Begin"
	${SET_DEBUG}

	typeset rc=0

	# Turn off PMF restart. Starting a domain does not leave
	# a running pid as in a classic Solaris Cluster agent.

	START_TIMEOUT=$(${SCHA_RESOURCE_GET} -O START_TIMEOUT \
	    -R ${RESOURCE} -G ${RESOURCEGROUP} )

	${SLEEP} ${START_TIMEOUT} &
	/usr/cluster/bin/pmfadm -s ${RESOURCEGROUP},${RESOURCE},0.svc

	# Check if the domain exists.
	#
	# If the domain does not exist, we may be starting the domain
	# on a new cluster node following a failover. As such we will
	# define the domain using the previously dumped XML, which is
	# read back from the CCR into ${TMP_DIR}/${RESOURCE}.xml.
	#
	# If the domain already exists, either the domain was manually
	# started or the domain was migrated or live migrated from
	# another cluster node. Therefore, we will use the already
	# defined domain.
	#
	# Note that when the domain is successfully stopped the domain
	# is deleted. We do this simply to avoid the domain from
	# being manually started on multiple cluster nodes. See
	# domain_delete() for more information.

	if ! get_${VM}_status > /dev/null 2>&1
	then
		if ! ${CCRADM} showkey --key xml_${RESOURCE} ${CCR_TABLE} > ${TMP_DIR}/${RESOURCE}.xml 2> /dev/null
		then
			# SCMSGS
			# @explanation
			# The domain does not exist.
			# @user_action
			# You must ensure that the domain exists.
			scds_syslog -p daemon.error -t $(syslog_tag) -m \
			    "Domain %s does not exist." \
			    "${DOMAIN}"

			debug_message "Function: start_domain - End"
			return 1
		fi

		# add the domain to the cluster node
		if ! add_${VM}_domain ${DOMAIN} ${TMP_DIR}/${RESOURCE}.xml
		then
			# error already logged.
			debug_message "Function: start_domain - End"
			return 1
		fi

		# SCMSGS
		# @explanation
		# The domain is being defined using a XML file.
		# @user_action
		# None, the domain is being defined using a previously defined
		# XML file when the domain was last successfully started.
		scds_syslog -p daemon.notice -t $(syslog_tag) -m \
		    "Domain %s defined using %s/%s.xml." \
		    "${DOMAIN}" "${TMP_DIR}" "${RESOURCE}"
	else
		debug_message "Validate - domain ${DOMAIN} exists"
	fi

	# Tolerate a manually started domain and a NO-OP start
	# otherwise start the domain.

	if ${CCRADM} showkey --key noop_${RESOURCE} ${CCR_TABLE} > /dev/null 2>&1
	then
		# SCMSGS
		# @explanation
		# The domain was migrated or live migrated.
		# @user_action
		# None required. Informational message.
		scds_syslog -p daemon.notice -t $(syslog_tag) -m \
		    "NO-OP START being performed."

		if ${CCRADM} delkey --key noop_${RESOURCE} ${CCR_TABLE} >> $LOGFILE 2>&1
		then
			debug_message "start_domain - noop_${RESOURCE} deleted"
		else
			# SCMSGS
			# @explanation
			# Failed to delete the NO-OP flag from CCR.
			# @user_action
			# Check the syslog for further messages.
			# Determine why the NO-OP flag was not deleted from the CCR.
			scds_syslog -p daemon.error -t $(syslog_tag) -m \
			    "Failed to delete NO-OP flag for %s domain." \
			    "${DOMAIN}"

			debug_message "Function: start_domain - End"
			return 1
		fi

	elif is_${VM}_up
	then
		# SCMSGS
		# @explanation
		# The domain was manually started.
		# @user_action
		# None required. Informational message.
		scds_syslog -p daemon.notice -t $(syslog_tag) -m \
		    "Domain %s was manually started." \
		    "${DOMAIN}"
	else
		if ! start_${VM}
		then
			# SCMSGS
			# @explanation
			# The domain failed to start.
			# @user_action
			# Check the syslog for further messages. If possible
			# the cluster will attempt to restart the domain.
			scds_syslog -p daemon.error -t $(syslog_tag) -m \
			    "Domain %s failed to start." \
			    "${DOMAIN}"

			rc=1
		else
			# SCMSGS
			# @explanation
			# The domain was started successfully.
			# @user_action
			# None required. Informational message.
			scds_syslog -p daemon.notice -t $(syslog_tag) -m \
			    "Domain %s started." \
			    "${DOMAIN}"
		fi
	fi

	if [[ ${rc} -eq 0 ]]
	then
		# Dump the domain configuration as XML. It is then used on
		# another cluster node to define the domain but only if the
		# domain does not exist.

		dump_domain_config
		rc=${?}
	fi

	debug_message "Function: start_domain - End"
	return ${rc}
}
797
798 #
799 # dump the domain configuration
800 #
#
# dump_xvm_xml
#
# Write the XML description of ${DOMAIN}, as produced by "virsh dumpxml",
# to stdout.  Error output of virsh is appended to ${LOGFILE}.
#
# Returns 0 on success, 1 (error logged) when virsh fails.
#
dump_xvm_xml()
{
	debug_message "Function: dump_xvm_xml - Begin"
	${SET_DEBUG}

	typeset rc=0

	if ! ${VIRSH} dumpxml ${DOMAIN} 2>> $LOGFILE
	then
		# SCMSGS
		# @explanation
		# "/usr/bin/virsh dumpxml" for domain failed.
		# @user_action
		# Determine why the command to dump domain
		# configuration failed.
		scds_syslog -p daemon.error -t $(syslog_tag) -m \
		    "%s dumpxml for domain %s failed." \
		    "${VIRSH}" "${DOMAIN}"

		# ${?} would be the status of the logging call at this
		# point, so the failure is recorded explicitly.
		rc=1
	fi

	debug_message "Function: dump_xvm_xml - End"
	return ${rc}
}
825
#
# dump_ldom_xml
#
# Write the constraints of ${DOMAIN} as XML, as produced by
# "ldm list-constraints -x", to stdout.  Error output of ldm is appended
# to ${LOGFILE}.
#
# Returns 0 on success, 1 (error logged) when ldm fails.
#
dump_ldom_xml()
{
	debug_message "Function: dump_ldom_xml - Begin"
	${SET_DEBUG}

	typeset rc=0

	${LDM} list-constraints -x ${DOMAIN} 2>> $LOGFILE
	case ${?} in
	0)
		;;
	*)
		# SCMSGS
		# @explanation
		# "/opt/SUNWldm/bin/ldm list-constraints -x"
		# for domain failed.
		# @user_action
		# Determine why the command to list the
		# domain constraints failed.
		scds_syslog -p daemon.error -t $(syslog_tag) -m \
		    "%s list-constraints for domain %s failed." \
		    "${LDM}" "${DOMAIN}"
		rc=1
		;;
	esac

	debug_message "Function: dump_ldom_xml - End"
	return ${rc}
}
851
852 #
853 # save the domain configuration in the cluster
854 # configuration repository
855 #
#
# dump_domain_config
#
# Store the current XML description of the domain, produced by
# dump_${VM}_xml, in the CCR table ${CCR_TABLE} under the key
# xml_${RESOURCE}.  The key is written only when the description differs
# from the stored one; it is added first and changed if adding fails.
#
# olddesc, newdesc and output are left set as global variables.
# Returns 0 on success, 1 when the table cannot be created, the XML
# cannot be produced or the CCR cannot be updated.
#
dump_domain_config()
{
	debug_message "Function: dump_domain_config - Begin"
	${SET_DEBUG}

	typeset rc=0

	# The domain configuration can be changed while under agent control,
	# so it is dumped again on every call.

	olddesc=$(${CCRADM} showkey --key xml_${RESOURCE} ${CCR_TABLE} 2> /dev/null)

	if (( ${?} == 1 ))
	then
		#
		# The ccr table might not exist.
		# create the CCR table, if it doesn't exist.
		#
		# NOTE(review): this relies on showkey returning 1 only for
		# a missing table and not for a missing key - confirm.
		#
		if ${CCRADM} addtab ${CCR_TABLE} >> $LOGFILE 2>&1
		then
			debug_message "created ccr table ${CCR_TABLE}"
		else
			# SCMSGS
			# @explanation
			# Failed to create the CCR table.
			# @user_action
			# Check the syslog for further messages.
			# Determine why the CCR create failed.
			scds_syslog -p daemon.error -t $(syslog_tag) -m \
			    "Failed to create CCR table %s." \
			    "${CCR_TABLE}"

			debug_message "Function: dump_domain_config - End"
			return 1
		fi
	fi

	output=$(dump_${VM}_xml)
	if (( ${?} == 0 )) && [[ -n "${output}" ]]
	then
		newdesc=$(echo ${output} | ${TR} -s '\n' '[ ]')
		if [ "${olddesc}" != "${newdesc}" ]
		then
			if ! ${CCRADM} addkey --key=xml_${RESOURCE} --value "${newdesc}" ${CCR_TABLE} > /dev/null 2>&1
			then
				if ! ${CCRADM} changekey --key=xml_${RESOURCE} --value "${newdesc}" ${CCR_TABLE} >> $LOGFILE 2>&1
				then
					# SCMSGS
					# @explanation
					# Failed to update the XML dump to the CCR.
					# @user_action
					# Check the syslog for further messages.
					# Determine why the ccr update failed.
					scds_syslog -p daemon.error -t $(syslog_tag) -m \
					    "Failed to update domain XML %s to ccr." \
					    "${DOMAIN}"

					rc=1
				fi
			else
				# debug_message takes plain text, not a format.
				debug_message "dump_domain_config - ${DOMAIN} configuration added to CCR"
			fi
		fi
	else
		# error already logged.
		rc=1
	fi

	debug_message "Function: dump_domain_config - End"

	return ${rc}
}
927
928 #
929 # probe function for domain data service
930 #
#
# check_domain
#
# Probe the domain.  Return values:
#
#	0	the domain is in an acceptable run state and the plugin
#		probe, if one is configured and executable, returned 0
#	100	the start program is still running, the domain is in a
#		restartable or unknown state, or the plugin probe timed
#		out, asked for a restart or returned an unexpected value
#	201	the plugin probe asked for an immediate failover while
#		the resource is neither starting nor stopping
#
# A configured plugin probe that is not executable is logged and
# otherwise ignored (0 is returned).
#
# SECONDS is reset; domstate, output, PROBE_TIMEOUT and HATIMERUN_TIMEOUT
# are left set as global variables.
#
check_domain()
{
	debug_message "Function: check_domain - Begin"
	${SET_DEBUG}

	typeset rc
	SECONDS=0

	if ${PGREP} -f "control_xvm start -R ${RESOURCE} " >/dev/null 2>&1
	then
		debug_message "Function: check_domain - start program is still running "
		rc=100
	else
		domstate=$(get_${VM}_status 2>/dev/null)

		case "${domstate}" in

		# Acceptable run states
		"running"|"blocked"|"paused"|"in shutdown"| \
		"active"|"suspending"|"resuming"|"suspended"|"starting")

			if [ "${#PLUGIN_PROBE}" -ne 0 ]
			then
				# PLUGIN_PROBE may carry arguments; its first word is the program.
				if [ -x "$(echo ${PLUGIN_PROBE} | ${AWK} '{print $1}')" ]
				then
					PROBE_TIMEOUT=$(${SCHA_RESOURCE_GET} -O Extension -R ${RESOURCE} -G ${RESOURCEGROUP} Probe_timeout|tail -1)
					# Run the supplied probe with only 90% of PROBE_TIMEOUT, less the
					# time already spent in this function. Also note that this value
					# is supplied as the last parameter to the PLUGIN_PROBE.

					HATIMERUN_TIMEOUT=$((PROBE_TIMEOUT*90/100-${SECONDS}))

					output=$(${HATIMERUN} -t ${HATIMERUN_TIMEOUT} -k 9 ${PLUGIN_PROBE} ${HATIMERUN_TIMEOUT})
					rc=${?}

					case ${rc} in
					0)	debug_message "check_domain - ${DOMAIN} ${output}"
						rc=0
						;;
					99)
						# SCMSGS
						# @explanation
						# The domain probe timed out.
						# @user_action
						# Ensure that ${PLUGIN_PROBE} can complete within
						# 90% of PROBE_TIMEOUT.
						scds_syslog -p daemon.error -t $(syslog_tag) -m \
						    "%s did not complete within %s seconds." \
						    "${PLUGIN_PROBE}" "${HATIMERUN_TIMEOUT}"

						rc=100
						;;
					100)	if ${PGREP} -f "gds_svc_start .*-R ${RESOURCE} " >/dev/null 2>&1
						then
							debug_message "check_domain - ${DOMAIN} is still starting"
							rc=100
						elif ${PGREP} -f "gds_svc_stop .*-R ${RESOURCE} " >/dev/null 2>&1
						then
							debug_message "check_domain - ${DOMAIN} is stopping"
							rc=100
						else
							# SCMSGS
							# @explanation
							# The domain probe has requested a domain restart.
							# @user_action
							# None. A domain restart will be attempted.
							scds_syslog -p daemon.error -t $(syslog_tag) -m \
							    "%s has requested a domain restart %s." \
							    "${PLUGIN_PROBE}" "${output}"

							rc=100
						fi
						;;
					201)	if ${PGREP} -f "gds_svc_start .*-R ${RESOURCE} " >/dev/null 2>&1
						then
							debug_message "check_domain - ${DOMAIN} is still starting"
							rc=100
						elif ${PGREP} -f "gds_svc_stop .*-R ${RESOURCE} " >/dev/null 2>&1
						then
							debug_message "check_domain - ${DOMAIN} is stopping"
							rc=100
						else
							# SCMSGS
							# @explanation
							# The domain has requested an immediate failover.
							# @user_action
							# None. The domain will be immediately failed over.
							scds_syslog -p daemon.error -t $(syslog_tag) -m \
							    "%s has requested an immediate failover." \
							    "${PLUGIN_PROBE}"

							rc=201
						fi
						;;
					*)
						# SCMSGS
						# @explanation
						# ${PLUGIN_PROBE} did not return 0, 100 or 201.
						# @user_action
						# None. A domain restart will be attempted.
						scds_syslog -p daemon.error -t $(syslog_tag) -m \
						    "%s did not return 0, 100 or 201, a domain restart will be attempted." \
						    "${PLUGIN_PROBE}"
						rc=100
						;;
					esac
				else
					# SCMSGS
					# @explanation
					# ${PLUGIN_PROBE} does not exist or is not executable.
					# @user_action
					# Check the pathname exists and that ${PLUGIN_PROBE} is executable.
					scds_syslog -p daemon.error -t $(syslog_tag) -m \
					    "%s non-existent executable." \
					    "${PLUGIN_PROBE}"

					rc=0
				fi
			else
				rc=0
			fi

			;;

		# Restartable run states

		"shut off"|"crashed"| \
		"inactive"|"stopping")

			rc=100
			;;

		# Unknown run states

		*)
			rc=100
			;;
		esac

		debug_message "check_domain - ${DOMAIN} ${domstate}"

	fi

	debug_message "Function: check_domain - End"
	return ${rc}
}
1076
#
# stop_domain
#
# STOP method body.  Saves the current domain configuration and then
# either shuts the domain down or migrates it, depending on whether the
# resource is being disabled and on ${MIGRATION_TYPE}.
#
# Sets the globals STOP_TIMEOUT, MAX_MIGRATE_TIMEOUT, MAX_STOP_TIMEOUT and
# ON_OFF_SWITCH and resets SECONDS.
#
# Returns 0 on success; 1 when the configuration cannot be saved or the
# Migration_type is invalid; otherwise the status of domain_shutdown.
#
stop_domain()
{
	debug_message "Function: stop_domain - Begin"
	${SET_DEBUG}

	typeset rc=0

	STOP_TIMEOUT=$(${SCHA_RESOURCE_GET} -O STOP_TIMEOUT \
	    -R ${RESOURCE} -G ${RESOURCEGROUP} )

	# Note that GDS will attempt to cleanup after 80% of STOP_TIMEOUT
	# has been consumed. In this regard, we only allocate a combined
	# 75% of STOP_TIMEOUT to MAX_MIGRATE_TIMEOUT and MAX_STOP_TIMEOUT.
	#
	# This leaves 5% for domain_destroy() which may be called if
	# domain_shutdown() exceeds its timeout and finally domain_delete().

	MAX_MIGRATE_TIMEOUT=$((STOP_TIMEOUT*25/100))
	MAX_STOP_TIMEOUT=$((STOP_TIMEOUT*50/100))
	SECONDS=0

	# Save the domain configuration changes.
	if ! dump_domain_config
	then
		debug_message "Function: stop_domain - End"
		return 1
	fi

	# At resource creation, the administrator can determine the Migration_type.
	# Valid values for Migration_type are
	#
	# Migration_type="normal"
	#	o Stop the resource (shutdown the domain)
	#	o Failover the resource group from the source node to the target node
	#	o Start the resource (start the domain)
	#
	# Migration_type="migrate"
	#	o Suspend the domain on the source node
	#	o Copy the domain's memory pages from the source node to the target node
	#	o Resume the domain on the target node
	#
	# Migration_type="migrate_live"
	#	o Iteratively copy the domain's memory pages from the source node to the target node
	#	o When pre-copy is no longer beneficial, suspend the domain on the source node
	#	o Copy the domain's remaining "dirty" pages from the source node to the target node
	#	o Resume the domain on the target node
	#
	# Note that migration or live migration is performed over the cluster interconnect.
	#
	# For migration or live migration to be attempted across Solaris Cluster xVM nodes
	# the following conditions must be met.
	#
	# - The target Solaris Cluster xVM node must be running the same xVM version.
	#
	# - The migration TCP port must be open and accepting connections from the source
	#   Solaris Cluster xVM node.
	#
	# - There must be sufficient resources for the domain to run in.
	#
	# - If the conditions are met and migration or live migration is successful a NO-OP
	#   STOP and START is performed. This will ensure a successful STOP and START to the
	#   appropriate RGM callback methods. Furthermore, doing a NO-OP RGM failover will
	#   ensure that RGM subsequently actions any dependencies and that Solaris Cluster
	#   reflects the correct state and status of resource groups and resources.
	#
	# - If the conditions are met but migration or live migration is not successful a
	#   normal failover will be performed.
	#
	# - If the conditions are not met, migration or live migration will fail and a normal
	#   failover will be performed.
	#
	# However, before attempting a migration or live migration we need to determine if the
	# resource is being disabled. To distinguish if the resource is being disabled we
	# test the ON_OFF_SWITCH property of the resource.
	#
	# If the resource is being disabled the ON_OFF_SWITCH will be DISABLED before the STOP
	# method is called. So, conversely if the ON_OFF_SWITCH is ENABLED the resource is not
	# being disabled and instead the resource group is undergoing either a switch to
	# another node or is being evacuated from the node.
	#
	# - If the resource is being disabled we perform a normal shutdown, regardless of the
	#   Migration_type setting.

	ON_OFF_SWITCH=$(${SCHA_RESOURCE_GET} -O ON_OFF_SWITCH -R ${RESOURCE} -G ${RESOURCEGROUP})

	debug_message "stop_domain - ON_OFF_SWITCH=${ON_OFF_SWITCH}"
	debug_message "stop_domain - MIGRATION_TYPE=${MIGRATION_TYPE}"

	if [[ "${ON_OFF_SWITCH}" = "DISABLED" ]]
	then
		# Report a failed shutdown, as the NORMAL case below does.
		domain_shutdown
		rc=${?}
	else
		case "${MIGRATION_TYPE}" in
		NORMAL)
			domain_shutdown
			rc=${?}
			;;
		MIGRATE*)
			# A successful migration leaves the "if" with status 0;
			# otherwise the status of the fallback shutdown is used.
			if ! domain_migrate
			then
				domain_shutdown
			fi
			rc=${?}
			;;
		*)
			# SCMSGS
			# @explanation
			# Invalid Migration_type specified.
			# @user_action
			# Delete and reregister the resource with
			# a valid Migration_type entry.
			scds_syslog -p daemon.error -t $(syslog_tag) -m \
			    "Invalid Migration_type=%s." \
			    "${MIGRATION_TYPE}"
			rc=1
			;;
		esac
	fi

	debug_message "Function: stop_domain - End"
	return ${rc}
}
1197
#
# get_target_host - work out which node a migration should be sent to.
#
# Globals read:    RESOURCEGROUP, SCHA_RESOURCEGROUP_GET, SCHA_CLUSTER_GET,
#                  TARGET_HOST (populated by check_commandlog)
# Globals written: PRIVATELINK_TARGET_HOST (only when a valid target was found)
# Returns:         0 if TARGET_HOST is a member of the resource group's
#                  nodelist, 1 otherwise (the caller then performs a normal
#                  failover).
#
get_target_host()
{
	debug_message "Function: get_target_host - Begin"
	${SET_DEBUG}

	typeset rc=1
	typeset node
	typeset local_node

	# Here, we need to determine the target host as the resource group is either being
	# switched or the node, where the resource group is online, is being evacuated.
	#
	# To determine the target host for a resource group switch we rely on the cluster
	# command log file /var/cluster/logs/commandlog to supply the target host. We need to
	# obtain the correct entry from the command log file and match against the following
	#
	#	date + ${RESOURCEGROUP} + "START" + "switch"
	#
	# after which we only save the nodename from a clrg or scswitch command.
	#
	# Sample /var/cluster/logs/commandlog output is as follows,
	#
	# 02/07/2008 08:45:13 pelko1 10548 root START - scswitch -z -g "xvm2-rg" -h "pelko2"
	# 02/07/2008 08:45:38 pelko1 10548 root END 0
	# 02/07/2008 09:01:35 pelko1 10874 root START - clrg "switch" -n "pelko1" "xvm2-rg"
	# 02/07/2008 09:01:36 pelko1 10874 root END -20827641
	#
	# If we are unable to match an entry, as perhaps the entry was logged at a given
	# second and we are checking one second later, we perform another check. In fact
	# the last 10 seconds are checked from the commandlog.
	#
	# Once we have matched an entry from /var/cluster/logs/commandlog, we verify that
	# the target host is a valid nodelist entry for the resource group.
	#
	# - If we have a valid nodelist entry we then determine that target host's cluster
	#   interconnect hostname to perform the migration or live migration.
	#
	# - If we are unable to find a match for a switch, we need to consider that an evacuate
	#   node is being performed. However, if the node is being evacuated we will rely on
	#   RGM to determine the nodename regardless if a migration or live migration was
	#   requested. Subsequently, we perform a normal failover. This ensures that we do not
	#   migrate or live migrate the domain to a node that may be different to the node
	#   selected by RGM.
	#
	# So, suffice to say that if a "switch" match is not found, following the discovery
	# that the resource is not just being disabled, and that a migrate or live migrate
	# was defined, we will always perform a normal failover.
	#
	# Note that the target host match is performed within check_commandlog().

	check_commandlog

	debug_message "get_target_host - ${TARGET_HOST} size=${#TARGET_HOST}"

	local_node=$(/usr/bin/uname -n)

	if [[ -z "${TARGET_HOST}" ]]
	then
		# SCMSGS
		# @explanation
		# A target host was not found
		# @user_action
		# None required. The domain will not be migrated or live
		# migrated instead a normal failover will be performed.
		scds_syslog -p daemon.notice -t "$(syslog_tag)" -m \
		    "Target host not found, normal failover will be performed."

	elif [[ "${TARGET_HOST}" = "${local_node}" || "${TARGET_HOST}" = *[0-9]:global* ]]
	then
		# The command named this node itself (or a "<digit>:global" entry),
		# which is how an evacuation shows up in the command log.

		# SCMSGS
		# @explanation
		# The node is being evacuated.
		# @user_action
		# None required. The domain will not be migrated or live
		# migrated. Instead, a normal failover will be performed.
		scds_syslog -p daemon.notice -t "$(syslog_tag)" -m \
		    "Node is being evacuated, normal failover will be performed."

	else
		# TARGET_HOST is known to differ from the local node here, so the
		# only remaining requirement is that it appears in the nodelist.
		for node in $(${SCHA_RESOURCEGROUP_GET} -O NODELIST -G "${RESOURCEGROUP}")
		do
			if [[ "${node}" = "${TARGET_HOST}" ]]
			then
				rc=0
				break
			fi
		done

		if (( ${rc} == 0 ))
		then
			PRIVATELINK_TARGET_HOST=$(${SCHA_CLUSTER_GET} -O PRIVATELINK_HOSTNAME_NODE "${TARGET_HOST}")
			debug_message "get_target_host - PRIVATELINK_TARGET_HOST=${PRIVATELINK_TARGET_HOST}"
		else
			# SCMSGS
			# @explanation
			# The target host found in the command log file is not
			# a valid entry within the resource groups nodelist.
			# @user_action
			# None required. The domain will not be migrated or live
			# migrated instead a normal failover will be performed.
			scds_syslog -p daemon.notice -t "$(syslog_tag)" -m \
			    "Target host %s not matched with the resource group nodelist, normal failover will be performed." \
			    "${TARGET_HOST}"
		fi
	fi

	debug_message "Function: get_target_host - End"
	return ${rc}
}
1300
#
# check_commandlog - look for a recent switch/evacuate in the cluster command log.
#
# Searches the command log for a START entry stamped with the current second,
# then with each of the preceding seconds (10 attempts in total), and stops at
# the first entry that yields a node name.
#
# Globals read:    RESOURCEGROUP, AWK, TR,
#                  COMMANDLOG (optional override of the log path; defaults to
#                  /var/cluster/logs/commandlog)
# Globals written: TARGET_HOST (empty when nothing matched)
#
check_commandlog()
{
	debug_message "Function: check_commandlog - Begin"

	typeset commandlog="${COMMANDLOG:-/var/cluster/logs/commandlog}"
	typeset etime
	typeset stamp
	typeset tries=10

	TARGET_HOST=""

	# Get the current epoch time
	etime=$(/usr/bin/perl -e 'print time;')

	if [[ ! -r "${commandlog}" || -z "${etime}" ]]
	then
		debug_message "check_commandlog - unable to search ${commandlog}"
		tries=0
	fi

	while (( ${tries} > 0 ))
	do
		# Iteratively search the commandlog for a switch or evacuate, going back in time
		# by one second each time. If a match is found we break out of the loop.
		#
		# Both the date and the time are derived from the same epoch value so
		# that stepping back across midnight also steps the date back.
		stamp=$(/usr/bin/perl -MPOSIX -e \
		    'print strftime("%m/%d/%Y %H:%M:%S", localtime($ARGV[0]));' "${etime}")

		# An empty pattern would match every line of the log, so give up
		# rather than pick up an unrelated entry.
		[[ -z "${stamp}" ]] && break

		debug_message "check_commandlog - performed for ${stamp}"

		# Check for a clrg switch or scswitch
		TARGET_HOST=$(/usr/bin/grep "${stamp}" "${commandlog}" |\
		    /usr/bin/grep -w START | /usr/bin/grep switch | /usr/bin/grep "\"${RESOURCEGROUP}\"" |\
		    /usr/bin/sed -e 's/^.*-h //' -e 's/^.*-n //' | ${AWK} '{print $1}' | ${TR} -d '" ')

		[[ -n "${TARGET_HOST}" ]] && break

		# Check for a clrg evacuate
		TARGET_HOST=$(/usr/bin/grep "${stamp}" "${commandlog}" |\
		    /usr/bin/grep -w START | /usr/bin/grep evacuate |\
		    /usr/bin/sed -e 's/^.*-n //' | ${AWK} '{print $1}' | ${TR} -d '+" ' )

		[[ -n "${TARGET_HOST}" ]] && break

		# Check for a scswitch -S
		# NOTE(review): the tr set also strips '-', 'S' and 'K' from the
		# node name itself - confirm that is intended.
		TARGET_HOST=$(/usr/bin/grep "${stamp}" "${commandlog}" |\
		    /usr/bin/grep -w START | /usr/bin/grep scswitch | /usr/bin/grep "\-S" |\
		    /usr/bin/sed -e 's/^.*-h //' | ${AWK} '{print $1}' | ${TR} -d '\-SK" ' )

		[[ -n "${TARGET_HOST}" ]] && break

		(( tries = tries - 1 ))
		(( etime = etime - 1 ))
	done

	debug_message "check_commandlog - TARGET_HOST=${TARGET_HOST}"

	debug_message "Function: check_commandlog - End"
}
1365
1366 #
1367 # routines to perform domain migration
1368 #
#
# migrate_xvm - migrate an xVM domain to the target node using xm.
#
# Globals read:    MIGRATION_TYPE, DOMAIN, PRIVATELINK_TARGET_HOST,
#                  MAX_MIGRATE_TIMEOUT, HATIMERUN, XM
# Globals written: OPTION (the xm sub-command and flags that were used)
# Returns:         the exit status of the hatimerun/xm invocation, or 1 when
#                  MIGRATION_TYPE is not a migration type.
#
migrate_xvm()
{
	debug_message "Function: migrate_xvm - Begin"
	${SET_DEBUG}

	typeset rc=0

	# Translate the resource property into the xm sub-command. The property
	# value itself (MIGRATE / MIGRATE_LIVE) is not something xm understands.
	case "${MIGRATION_TYPE}" in
	MIGRATE)
		OPTION="migrate"
		;;
	MIGRATE_LIVE)
		OPTION="migrate --live"
		;;
	*)
		debug_message "migrate_xvm - unsupported MIGRATION_TYPE=${MIGRATION_TYPE}"
		debug_message "Function: migrate_xvm - End"
		return 1
		;;
	esac

	debug_message "domain_migrate - Running /usr/sbin/xm ${OPTION} ${DOMAIN} ${PRIVATELINK_TARGET_HOST}"

	# ${OPTION} is deliberately unquoted so that "migrate --live" is passed
	# to xm as two separate arguments.
	${HATIMERUN} -t "${MAX_MIGRATE_TIMEOUT}" -k KILL \
	    ${XM} ${OPTION} "${DOMAIN}" "${PRIVATELINK_TARGET_HOST}" > /dev/null 2>&1
	rc=${?}

	debug_message "Function: migrate_xvm - End"
	return ${rc}
}
1388
#
# migrate_ldom - migrate a logical domain through the ldm_migrate helper.
#
# The helper is run under hatimerun with MAX_MIGRATE_TIMEOUT and its output is
# appended to LOGFILE. OPTION is set to "migrate" when MIGRATION_TYPE is
# MIGRATE and is otherwise left untouched. Returns the hatimerun exit status.
#
migrate_ldom()
{
	debug_message "Function: migrate_ldom - Begin"
	${SET_DEBUG}

	typeset migrate_rc

	if [[ "${MIGRATION_TYPE}" = "MIGRATE" ]]
	then
		OPTION="migrate"
	fi

	debug_message "domain_migrate - Running /opt/SUNWscxvm/bin/ldm_migrate ${OPTION} ${DOMAIN} ${PRIVATELINK_TARGET_HOST}"

	if ${HATIMERUN} -t ${MAX_MIGRATE_TIMEOUT} -k KILL \
	    /opt/SUNWscxvm/bin/ldm_migrate ${OPTION} "${DOMAIN}" ${PRIVATELINK_TARGET_HOST} ${PASSWORD_FILE} >> $LOGFILE 2>&1
	then
		migrate_rc=0
	else
		migrate_rc=${?}
	fi

	debug_message "Function: migrate_ldom - End"
	return ${migrate_rc}
}
1407
1408 #
1409 # routines to cancel migration
1410 #
#
# cancel_xvm_migration - placeholder so that cancel_${VM}_migration resolves
# for xvm; it performs no work and always reports success.
#
cancel_xvm_migration()
{
	# Nothing to do for an xvm domain.
	:
}
1416
#
# cancel_ldom_migration - cancel an in-flight ldm migration and wait for the
# domain to leave its transitional state.
#
# Globals read: LDM, DOMAIN, LOGFILE, GREP, VM, MAX_STOP_TIMEOUT, SECONDS
#
# The wait is bounded by the shell's SECONDS clock reaching MAX_STOP_TIMEOUT.
# SECONDS is only read here, never assigned, so code that runs afterwards
# still sees the real elapsed time.
#
cancel_ldom_migration()
{
	debug_message "Function: cancel_ldom_migration - Begin"
	${SET_DEBUG}

	# cancel domain migration for ldoms
	if ${LDM} cancel-operation migration "${DOMAIN}" >> "${LOGFILE}" 2>&1
	then
		# SCMSGS
		# @explanation
		# The domain migration operation was cancelled.
		# @user_action
		# None required. Informational message.
		scds_syslog -p daemon.notice -t "$(syslog_tag)" -m \
		    "Migration of domain %s is cancelled, the domain state is now in active state." \
		    "${DOMAIN}"
	fi

	# Poll every 5 seconds while the domain still reports a transitional
	# state; leave as soon as it has settled or the time budget is used up.
	while (( ${SECONDS} < ${MAX_STOP_TIMEOUT} ))
	do
		get_${VM}_status | ${GREP} -q -E "^suspending|^resuming|^suspended|^starting" > /dev/null 2>&1 || break
		sleep 5
	done

	debug_message "Function: cancel_ldom_migration - End"
}
1447
#
# domain_migrate - attempt to migrate (or live migrate) the domain.
#
# Globals read:    MIGRATION_TYPE, VM, DOMAIN, RESOURCE, CCR_TABLE, CCRADM,
#                  LOGFILE, TARGET_HOST / PRIVATELINK_TARGET_HOST (set through
#                  get_target_host)
# Globals written: MSG
# Returns:         0 when the domain was migrated and the NO-OP flag was
#                  stored in the CCR; 1 in every other case.
#
domain_migrate()
{
	debug_message "Function: domain_migrate - Begin"
	${SET_DEBUG}

	typeset rc

	case "${MIGRATION_TYPE}" in
	MIGRATE)	MSG="migrated" ;;
	MIGRATE_LIVE)	MSG="live migrated" ;;
	esac

	if get_target_host
	then
		# SCMSGS
		# @explanation
		# The domain is being migrated or live migrated to the target host.
		# @user_action
		# None required.
		scds_syslog -p daemon.notice -t "$(syslog_tag)" -m \
		    "Domain %s is being %s to %s." \
		    "${DOMAIN}" "${MSG}" "${TARGET_HOST}"

		migrate_${VM} ${MIGRATION_TYPE} ${DOMAIN} ${PRIVATELINK_TARGET_HOST}
		rc=${?}

		if (( ${rc} == 0 ))
		then
			# SCMSGS
			# @explanation
			# The domain was migrated or live migrated to the target host.
			# @user_action
			# None required. The domain successfully migrated or live migrated
			# from the source node to the target node.
			scds_syslog -p daemon.notice -t "$(syslog_tag)" -m \
			    "Domain %s successfully %s to %s." \
			    "${DOMAIN}" "${MSG}" "${TARGET_HOST}"

			# As the domain has been successfully migrated or live migrated
			# we need to indicate a successful stop by performing a NO-OP stop
			# and subsequently a successful start by performing a NO-OP start.

			if ${CCRADM} addkey --key=noop_${RESOURCE} --value="1" ${CCR_TABLE} >> $LOGFILE 2>&1
			then
				debug_message "domain_migrate - .noop_${RESOURCE} flag added to CCR"

				# Only announce the NO-OP STOP once the flag is
				# really in place; without it this function
				# returns 1.

				# SCMSGS
				# @explanation
				# The domain was migrated or live migrated.
				# @user_action
				# None required. Informational message.
				scds_syslog -p daemon.notice -t "$(syslog_tag)" -m \
				    "NO-OP STOP being performed."
			else
				# SCMSGS
				# @explanation
				# Failed to add the NO-OP flag for the domain to the CCR.
				# @user_action
				# Check the syslog for further messages.
				# Determine why the ccr update failed.
				scds_syslog -p daemon.error -t "$(syslog_tag)" -m \
				    "Failed to add NO-OP flag for %s to ccr." \
				    "${DOMAIN}"
				rc=1
			fi

		elif (( ${rc} == 99 ))
		then
			# Exit status 99 is reported as a timeout of the migration
			# command.

			# SCMSGS
			# @explanation
			# The domain migration or live migration timed out.
			# @user_action
			# None required. Informational message.
			scds_syslog -p daemon.notice -t "$(syslog_tag)" -m \
			    "Migration of domain %s timed out, the domain state is now shut off." \
			    "${DOMAIN}"

			rc=1
			cancel_${VM}_migration
		else
			# SCMSGS
			# @explanation
			# The domain failed to migrate or live migrate to the target host.
			# @user_action
			# None required. The domain failed to migrate or live migrate
			# from the source node to the target node. A normal failover
			# will be performed.
			scds_syslog -p daemon.notice -t "$(syslog_tag)" -m \
			    "Domain %s failed to %s to %s, normal failover will be performed." \
			    "${DOMAIN}" "${MSG}" "${TARGET_HOST}"

			rc=1
			cancel_${VM}_migration
		fi
	else
		rc=1
	fi

	# If the domain has successfully migrated, we will now delete the domain.
	#
	# Doing this ensures that the domain is only defined and able to be started
	# on one cluster node at a time. Domains can use shared storage between cluster
	# nodes so it is very important that we prevent any data corruption if a domain
	# gets manually started on multiple cluster nodes where shared storage is used.
	#
	# Of course using SUNW.HAStoragePlus somewhat protects against this, however we
	# simply want to avoid any manual administrative errors performed by mistake.
	#
	# Note, unless the domain was migrated or live migrated, the domain is defined
	# before startup using a previously dumped XML file for the administrative file
	# system.
	#
	# The outcome of the delete does not change the return value.

	(( ${rc} == 0 )) && [[ "${VM}" == "xvm" ]] && domain_delete

	debug_message "Function: domain_migrate - End"
	return ${rc}
}
1563
1564 #
1565 # routines to perform domain shutdown
1566 #
#
# shutdown_xvm - ask virsh to shut the xVM domain down.
#
# Output of the command is discarded. Returns the exit status of
# "virsh shutdown".
#
shutdown_xvm()
{
	debug_message "Function: shutdown_xvm - Begin"
	${SET_DEBUG}

	typeset virsh_rc

	# virsh shutdown only requests the shutdown and returns before the
	# domain is actually down, hence no hatimerun wrapper here.

	if ${VIRSH} shutdown ${DOMAIN} > /dev/null 2>&1
	then
		virsh_rc=0
	else
		virsh_rc=${?}
	fi

	debug_message "Function: shutdown_xvm - End"
	return ${virsh_rc}
}
1583
#
# shutdown_ldom - stop a logical domain with "ldm stop-domain".
#
# Globals read: VM, GREP, HATIMERUN, LDM, DOMAIN, MAX_STOP_TIMEOUT, LOGFILE
# Returns:      0 - the domain was stopped, or was not in a running or
#                   transitional state to begin with
#               2 - the domain status could not be obtained (domain absent)
#               otherwise the exit status of hatimerun / ldm stop-domain
#
shutdown_ldom()
{
	debug_message "Function: shutdown_ldom - Begin"
	${SET_DEBUG}

	typeset rc
	typeset dom_state

	# Assignment is kept separate from typeset so that ${?} reflects the
	# status query and not the typeset builtin.
	dom_state=$(get_${VM}_status)
	if (( ${?} == 0 ))
	then
		# The expansion is left unquoted on purpose: it flattens the
		# status output onto one line before the anchored match.
		if echo ${dom_state} | ${GREP} -q -E "^active$|^suspending|^resuming|^suspended|^starting" > /dev/null 2>&1
		then
			${HATIMERUN} -t ${MAX_STOP_TIMEOUT} -k KILL ${LDM} stop-domain ${DOMAIN} >> $LOGFILE 2>&1
			rc=${?}
		else
			# domain is already stopped
			rc=0
		fi
	else
		# domain is not present.
		rc=2
	fi

	debug_message "Function: shutdown_ldom - End"
	return ${rc}
}
1610
#
# domain_shutdown - shut the domain down gracefully, falling back to an
# immediate termination, and delete the domain definition on success.
#
# Globals read: VM, DOMAIN, MAX_STOP_TIMEOUT, SECONDS
# Returns:      0 - the domain is down (or was not present at all)
#               otherwise the exit status of destroy_${VM}
#
# SECONDS is only read here, never assigned, so the shell's elapsed-time
# clock stays intact for any code that runs afterwards.
#
domain_shutdown()
{
	debug_message "Function: domain_shutdown - Begin"
	${SET_DEBUG}

	typeset rc

	# Coordinate with the domain OS to perform a graceful shutdown.
	# Note that the virsh shutdown command returns before the domain
	# has shutdown, as such we do not use hatimerun.

	shutdown_${VM}
	rc=${?}
	if (( ${rc} == 2 ))
	then
		# Status 2 is how shutdown_${VM} reports that the domain is not
		# present, so there is nothing to stop or delete.
		debug_message "Function: domain_shutdown - End"
		return 0
	elif (( ${rc} == 0 ))
	then
		# Wait, in 5 second steps, for the domain to go down or for the
		# shutdown time to be exceeded, whichever comes first.

		while (( ${SECONDS} < ${MAX_STOP_TIMEOUT} )) && is_${VM}_up
		do
			sleep 5
		done

		if is_${VM}_up
		then
			# SCMSGS
			# @explanation
			# The domain failed to shutdown gracefully.
			# @user_action
			# None required. The domain failed to shutdown
			# gracefully and will now be immediately terminated.
			scds_syslog -p daemon.notice -t "$(syslog_tag)" -m \
			    "Domain %s failed to shutdown gracefully, immediate shutdown will now be performed." \
			    "${DOMAIN}"

			destroy_${VM}
			rc=${?}
		else
			# SCMSGS
			# @explanation
			# The domain was shutdown gracefully.
			# @user_action
			# None required. The domain has shutdown gracefully.
			scds_syslog -p daemon.info -t "$(syslog_tag)" -m \
			    "Domain %s has been gracefully shutdown." \
			    "${DOMAIN}"
			rc=0
		fi

	else
		# error already logged
		destroy_${VM}
		rc=${?}
	fi

	# If the domain has successfully shutdown, we will now delete the domain.
	#
	# Doing this ensures that the domain is only defined and able to be started
	# on one cluster node at a time. Domains can use shared storage between cluster
	# nodes so it is very important that we prevent any data corruption if a domain
	# gets manually started on multiple cluster nodes where shared storage is used.
	#
	# Of course using SUNW.HAStoragePlus somewhat protects against this, however we
	# simply want to avoid any manual administrative errors performed by mistake.
	#
	# Note, unless the domain was migrated or live migrated, the domain is defined
	# before startup using a previously dumped XML file for the administrative file
	# system.
	#
	# The outcome of the delete does not change the return value.

	(( ${rc} == 0 )) && domain_delete

	debug_message "Function: domain_shutdown - End"
	return ${rc}
}
1694
1695 #
1696 # routines to destroy domain
1697 #
#
# destroy_xvm - terminate the xVM domain immediately with "virsh destroy".
#
# Command output is appended to LOGFILE. Returns 0 when virsh succeeded and
# 1 when it failed; the outcome is logged through scds_syslog either way.
#
destroy_xvm()
{
	debug_message "Function: destroy_xvm - Begin"
	${SET_DEBUG}

	typeset rc=1

	if ! ${VIRSH} destroy ${DOMAIN} >> $LOGFILE 2>&1
	then
		# SCMSGS
		# @explanation
		# The /usr/bin/virsh destroy command failed.
		# @user_action
		# Determine why it was not possible to immediately terminate
		# the domain.
		scds_syslog -p daemon.error -t $(syslog_tag) -m \
		    "Domain %s failed to shutdown immediately." \
		    "${DOMAIN}"
	else
		# SCMSGS
		# @explanation
		# The domain was immediately terminated.
		# @user_action
		# None required. The domain had previously failed to shutdown
		# gracefully but has now been immediately terminated.
		scds_syslog -p daemon.notice -t $(syslog_tag) -m \
		    "Domain %s has been immediately terminated." \
		    "${DOMAIN}"
		rc=0
	fi

	debug_message "Function: destroy_xvm - End"
	return ${rc}
}
1733
#
# destroy_ldom - force a logical domain down with "ldm stop-domain -f".
#
# Command output is appended to LOGFILE. Returns 0 when ldm succeeded and
# 1 when it failed; the outcome is logged through scds_syslog either way.
#
destroy_ldom()
{
	debug_message "Function: destroy_ldom - Begin"
	${SET_DEBUG}

	typeset rc=1

	if ! ${LDM} stop-domain -f ${DOMAIN} >> $LOGFILE 2>&1
	then
		# SCMSGS
		# @explanation
		# The /opt/SUNWldm/bin/ldm stop-domain "-f" command failed.
		# @user_action
		# Determine why it was not possible to forcefully stop
		# the domain.
		scds_syslog -p daemon.error -t $(syslog_tag) -m \
		    "Domain %s failed to do a forceful shutdown." \
		    "${DOMAIN}"
	else
		# SCMSGS
		# @explanation
		# The domain was immediately terminated.
		# @user_action
		# None required. The domain had previously failed to shutdown
		# gracefully but has now been immediately terminated.
		scds_syslog -p daemon.notice -t $(syslog_tag) -m \
		    "Domain %s has been forcefully terminated." \
		    "${DOMAIN}"
		rc=0
	fi

	debug_message "Function: destroy_ldom - End"
	return ${rc}
}
1769
1770 #
1771 # routines to remove domains from the node
1772 #
#
# domain_delete - remove the domain definition from this node.
#
# Dispatches to delete_${VM}. Returns 0 when that succeeded (and logs a
# notice), 1 otherwise.
#
domain_delete()
{
	debug_message "Function: domain_delete - Begin"
	${SET_DEBUG}

	# A domain that stays defined after shutdown could be started by hand
	# on another node, which would put the domain at risk when it lives on
	# shared storage. The configuration is always dumped to the agent's
	# administrative file system, so the domain can be defined again
	# before the next startup.

	# Assume failure; delete_${VM} has already logged the reason if so.
	typeset rc=1

	if delete_${VM}
	then
		# SCMSGS
		# @explanation
		# The domain was deleted.
		# @user_action
		# None required. The domain has been deleted as it
		# will be defined on another node. Deleting the domain
		# on this node ensures that it can't be started on
		# more than one cluster node at a time.
		scds_syslog -p daemon.notice -t $(syslog_tag) -m \
		    "Domain %s has been deleted on this node." \
		    "${DOMAIN}"
		rc=0
	fi

	debug_message "Function: domain_delete - End"
	return ${rc}
}
1808
#
# delete_xvm - remove the xVM domain definition with "xm delete".
#
# Globals read: XM, DOMAIN, LOGFILE
# Returns:      0 on success, 1 when xm delete failed (an error is logged).
#
delete_xvm()
{
	debug_message "Function: delete_xvm - Begin"
	${SET_DEBUG}

	typeset rc=0

	# Use the ${XM} command definition from the top of the file rather
	# than a second, hard-coded copy of the path.
	if ! ${XM} delete "${DOMAIN}" >> "${LOGFILE}" 2>&1
	then
		# SCMSGS
		# @explanation
		# The /usr/sbin/xm delete command failed.
		# @user_action
		# Determine why it was not possible to delete the domain.
		scds_syslog -p daemon.error -t "$(syslog_tag)" -m \
		    "Failed to delete domain %s on this node." \
		    "${DOMAIN}"
		rc=1
	fi

	debug_message "Function: delete_xvm - End"
	return ${rc}
}
1832
#
# delete_ldom - remove a logical domain from this node.
#
# A domain reported as "bound" is unbound first; the domain is then removed.
# Returns 0 when the domain was removed and 1 when either ldm step failed
# (the failing step is logged through scds_syslog).
#
delete_ldom()
{
	debug_message "Function: delete_ldom - Begin"
	${SET_DEBUG}

	typeset rc=0

	if get_${VM}_status | ${GREP} -q -E "^bound$" > /dev/null 2>&1
	then
		# if the domain is in bound state, unbind it.
		if ! ${LDM} unbind-domain ${DOMAIN} >> $LOGFILE 2>&1
		then
			# SCMSGS
			# @explanation
			# The /opt/SUNWldm/bin/ldm unbind-domain command failed.
			# @user_action
			# Determine why it was not possible to unbind the domain.
			scds_syslog -p daemon.error -t $(syslog_tag) -m \
			    "Failed to unbind domain %s on this node." \
			    "${DOMAIN}"
			rc=1
		fi
	fi

	# The removal is only attempted when the unbind step did not fail.
	if (( ${rc} == 0 )) && ! ${LDM} remove-domain ${DOMAIN} >> $LOGFILE 2>&1
	then
		# SCMSGS
		# @explanation
		# The /opt/SUNWldm/bin/ldm remove-domain command failed.
		# @user_action
		# Determine why it was not possible to remove the domain.
		scds_syslog -p daemon.error -t $(syslog_tag) -m \
		    "Failed to remove domain %s on this node." \
		    "${DOMAIN}"
		rc=1
	fi

	debug_message "Function: delete_ldom - End"
	return ${rc}
}