489 lines
16 KiB
Python
489 lines
16 KiB
Python
|
#!/usr/bin/env python3
|
||
|
|
||
|
import argparse
|
||
|
import os, atexit
|
||
|
import textwrap
|
||
|
import time
|
||
|
import tempfile
|
||
|
import threading, subprocess
|
||
|
import barrier, finishedSignal
|
||
|
|
||
|
|
||
|
import signal
|
||
|
import random
|
||
|
import time
|
||
|
from enum import Enum
|
||
|
|
||
|
from collections import defaultdict, OrderedDict
|
||
|
|
||
|
|
||
|
# Address handed to barrier.Barrier and to every process via `--barrier`.
BARRIER_IP, BARRIER_PORT = 'localhost', 10000

# Address handed to finishedSignal.FinishedSignal and to every process via `--signal`.
SIGNAL_IP, SIGNAL_PORT = 'localhost', 11000

# Added to a logical process id to form the last column of each hosts-file line.
# NOTE(review): despite the `_IP` suffix this looks like a base port number - confirm.
PROCESSES_BASE_IP = 11000

# Do not run multiple validations concurrently!
# (All of the addresses above are fixed, so two runs would use the same ones.)
|
||
|
|
||
|
class TC:
    """Apply `tc ... netem` delay/loss/reorder settings to a network interface.

    Constructing an instance immediately runs the `tc qdisc add` and
    `tc qdisc change` commands and registers `cleanup` with `atexit`, so the
    qdisc is deleted again when the interpreter exits.

    Args:
        losses: Mapping with the keys 'delay', 'loss' and 'reordering'; each
            value is a 2-item sequence whose items are inserted, in order,
            after the corresponding netem keyword.
        interface: Name of the device passed to `tc` (`dev <interface>`).
        needSudo: When true, every command is run through `sudo -S`.
        sudoPassword: Password piped to `sudo -S` on stdin.
    """

    def __init__(self, losses, interface="lo", needSudo=True, sudoPassword=""):
        self.losses = losses
        self.interface = interface
        self.needSudo = needSudo
        self.sudoPassword = sudoPassword

        # The `add` command discards its stderr; the `change` command then
        # installs the requested parameters on the root netem qdisc.
        addCmd = 'tc qdisc add dev {} root netem 2>/dev/null'.format(self.interface)
        changeCmd = 'tc qdisc change dev {} root netem delay {} {} loss {} {} reorder {} {}'.format(
            self.interface,
            *self.losses['delay'],
            *self.losses['loss'],
            *self.losses['reordering'])

        os.system(self._shellCommand(addCmd))
        os.system(self._shellCommand(changeCmd))

        atexit.register(self.cleanup)

    def _shellCommand(self, cmd):
        """Return the shell line that runs `cmd`, wrapped in sudo if required.

        The previous code formatted 'echo %s | sudo -S {}' with `str.format`,
        which leaves the `%s` untouched, so the literal text "%s" was sent to
        sudo instead of the configured password.
        """
        if not self.needSudo:
            return cmd

        # Local import: shlex is not among the module-level imports.
        import shlex

        # Quote the password so shell metacharacters in it cannot alter the command.
        # NOTE: the password is still visible in the shell command line.
        return 'echo {} | sudo -S {}'.format(shlex.quote(self.sudoPassword), cmd)

    def __str__(self):
        """Return a four-line, human-readable summary of the configuration."""
        ret = """\
        Interface: {}
        Delay: {} {}
        Loss: {} {}
        Reordering: {} {}""".format(
            self.interface,
            *self.losses['delay'],
            *self.losses['loss'],
            *self.losses['reordering'])

        return textwrap.dedent(ret)

    def cleanup(self):
        """Delete the root qdisc of the interface (stderr is discarded)."""
        cmd = 'tc qdisc del dev {} root 2>/dev/null'.format(self.interface)
        os.system(self._shellCommand(cmd))
|
||
|
|
||
|
class ProcessState(Enum):
    """State the harness tracks for each launched process."""

    RUNNING = 1     # mapped to SIGCONT by ProcessInfo.stateToSignal
    STOPPED = 2     # mapped to SIGSTOP
    TERMINATED = 3  # mapped to SIGTERM
|
||
|
|
||
|
class ProcessInfo:
    """Bookkeeping record for one launched process.

    Attributes are plain fields: `handle` is whatever object the caller
    supplies (StressTest passes the process handle and calls `send_signal`
    on it), `state` starts as ProcessState.RUNNING, and `lock` is a
    threading.Lock that callers hold while they inspect or change `state`.
    """

    def __init__(self, handle):
        self.handle = handle
        self.state = ProcessState.RUNNING
        self.lock = threading.Lock()

    @staticmethod
    def stateToSignal(state):
        """Return the signal that moves a process into `state`.

        Returns None when `state` is not one of the three ProcessState members.
        """
        table = (
            (ProcessState.RUNNING, signal.SIGCONT),
            (ProcessState.STOPPED, signal.SIGSTOP),
            (ProcessState.TERMINATED, signal.SIGTERM),
        )
        for candidate, signum in table:
            if state == candidate:
                return signum
        return None

    @staticmethod
    def stateToSignalStr(state):
        """Return the name of the signal for `state` as text (None if unknown)."""
        table = (
            (ProcessState.RUNNING, "SIGCONT"),
            (ProcessState.STOPPED, "SIGSTOP"),
            (ProcessState.TERMINATED, "SIGTERM"),
        )
        for candidate, label in table:
            if state == candidate:
                return label
        return None

    @staticmethod
    def validStateTransition(current, desired):
        """Tell whether a process in `current` may be moved to `desired`.

        Allowed: RUNNING -> STOPPED, RUNNING -> TERMINATED, STOPPED -> RUNNING.
        Everything else, including any transition out of TERMINATED, is rejected.
        """
        leavesRunning = current == ProcessState.RUNNING and (
            desired == ProcessState.STOPPED or desired == ProcessState.TERMINATED)
        leavesStopped = current == ProcessState.STOPPED and desired == ProcessState.RUNNING
        return leavesRunning or leavesStopped
|
||
|
|
||
|
class AtomicSaturatedCounter:
    """Lock-protected counter that stops incrementing at a fixed ceiling."""

    def __init__(self, saturation, initial=0):
        """Store the ceiling `saturation` and the starting value `initial`."""
        self._lock = threading.Lock()
        self._value = initial
        self._saturation = saturation

    def reserve(self):
        """Try to take one slot.

        Returns True and increments the counter while it is below the
        ceiling; returns False, leaving the counter untouched, otherwise.
        """
        with self._lock:
            granted = self._value < self._saturation
            if granted:
                self._value += 1
            return granted
|
||
|
|
||
|
class Validation:
    """Base class for checking the output files written by the processes.

    Subclasses override `generateConfig` and `checkProcess`; the versions
    here do nothing and return None.
    """

    def __init__(self, processes, messages, outputDir):
        """Record the run parameters.

        Raises:
            Exception: if `outputDir` does not name an existing directory.
        """
        self.processes = processes
        self.messages = messages

        resolved = os.path.abspath(outputDir)
        self.outputDirPath = resolved
        if not os.path.isdir(resolved):
            raise Exception("`{}` is not a directory".format(resolved))

    def generateConfig(self):
        # Implement on the derived classes
        return None

    def checkProcess(self, pid):
        # Implement on the derived classes
        return None

    def checkAll(self, continueOnError=True):
        """Run `checkProcess` for the ids 1..processes.

        Returns True only if every check returned a truthy value. With
        `continueOnError=False` the loop stops at the first failing id.
        """
        allPassed = True
        for pid in range(1, self.processes + 1):
            if self.checkProcess(pid):
                continue

            if not continueOnError:
                return False
            allPassed = False

        return allPassed
|
||
|
|
||
|
class FifoBroadcastValidation(Validation):
    """Checks each process' output file for in-order broadcasts and deliveries."""

    def generateConfig(self):
        """Write the hosts and config files for a run.

        The hosts file gets one line per process id 1..processes in the form
        "<id> localhost <PROCESSES_BASE_IP + id>"; the config file contains
        the number of messages on a single line.

        Returns:
            (hosts, config): two flushed, still-open NamedTemporaryFile
            objects. The caller has to keep them alive while their `.name`
            paths are in use.
        """
        hosts = tempfile.NamedTemporaryFile(mode='w')
        config = tempfile.NamedTemporaryFile(mode='w')

        for i in range(1, self.processes + 1):
            hosts.write("{} localhost {}\n".format(i, PROCESSES_BASE_IP + i))
        hosts.flush()

        config.write("{}\n".format(self.messages))
        config.flush()

        return (hosts, config)

    def checkProcess(self, pid):
        """Validate the file `proc<pid>.output` in the output directory.

        Lines starting with 'b' ("b <msg>") must carry the numbers 1, 2, 3, ...
        Lines starting with 'd' ("d <sender> <msg>") must, per sender, carry
        the numbers 1, 2, 3, ... Other lines are ignored.

        Returns:
            True if the file satisfies both rules, False (after printing a
            diagnostic) at the first violation or malformed record.
        """
        filePath = os.path.join(self.outputDirPath, 'proc{:02d}.output'.format(pid))
        filename = os.path.basename(filePath)

        expectedBroadcast = 1
        nextMessage = defaultdict(lambda: 1)

        with open(filePath) as f:
            # Number lines from 1 so diagnostics match what an editor shows.
            for lineNumber, line in enumerate(f, start=1):
                tokens = line.split()
                if not tokens:
                    # Blank line: nothing to check (previously an IndexError).
                    continue

                try:
                    # Check broadcast
                    if tokens[0] == 'b':
                        msg = int(tokens[1])
                        if msg != expectedBroadcast:
                            print("File {}, Line {}: Messages broadcast out of order. Expected message {} but broadcast message {}".format(filename, lineNumber, expectedBroadcast, msg))
                            return False
                        expectedBroadcast += 1

                    # Check delivery
                    elif tokens[0] == 'd':
                        sender = int(tokens[1])
                        msg = int(tokens[2])
                        if msg != nextMessage[sender]:
                            print("File {}, Line {}: Message delivered out of order. Expected message {}, but delivered message {}".format(filename, lineNumber, nextMessage[sender], msg))
                            return False
                        nextMessage[sender] = msg + 1
                except (IndexError, ValueError):
                    # A record with missing or non-numeric fields fails
                    # validation instead of crashing the validator.
                    print("File {}, Line {}: Malformed line `{}`".format(filename, lineNumber, line.rstrip('\n')))
                    return False

        return True
|
||
|
|
||
|
class LCausalBroadcastValidation(Validation):
    """Placeholder for localized-causal broadcast validation (not implemented)."""

    def __init__(self, processes, messages, outputDir, causalRelationships=None):
        """Record the run parameters.

        The previous signature omitted `messages` and forwarded only two
        arguments to Validation.__init__, which requires three, so every
        construction raised TypeError. The signature now matches the
        four-argument call made by `main`.

        Args:
            processes: Number of processes.
            messages: Number of messages per process.
            outputDir: Existing directory containing the process outputs.
            causalRelationships: Kept on the instance for the future
                implementation; not interpreted here.
        """
        super().__init__(processes, messages, outputDir)
        self.causalRelationships = causalRelationships

    def generateConfig(self):
        raise NotImplementedError()

    def checkProcess(self, pid):
        raise NotImplementedError()
|
||
|
|
||
|
class StressTest:
    """Randomly stops, continues and terminates the launched processes.

    Args:
        procs: Sequence of (logicalPID, handle) pairs; `handle` must offer
            `send_signal`. `stress` picks ids from 1..len(procs).
        concurrency: Number of threads running `stress` (values <= 1 run it
            once in the calling thread).
        attempts: Number of successful signal deliveries each `stress` call
            performs before returning.
        attemptsRatio: Mapping with the keys 'STOP', 'CONT' and 'TERM' giving
            the relative weight of each action.
    """

    def __init__(self, procs, concurrency, attempts, attemptsRatio):
        self.processes = len(procs)
        self.processesInfo = dict()
        for (logicalPID, handle) in procs:
            self.processesInfo[logicalPID] = ProcessInfo(handle)

        self.concurrency = concurrency
        self.attempts = attempts
        self.attemptsRatio = attemptsRatio

        # Strictly fewer than half of the processes may be terminated. The
        # former odd/even conditional evaluated to this value in both branches.
        maxTerminatedProcesses = (self.processes - 1) // 2
        self.terminatedProcs = AtomicSaturatedCounter(maxTerminatedProcesses)

    def stress(self):
        """Deliver `self.attempts` randomly chosen, valid signals."""
        selectProc = list(range(1, self.processes + 1))
        random.shuffle(selectProc)

        # Weighted pool of actions; round() avoids losing an entry to float
        # truncation (e.g. 1000 * 0.29 == 289.99...).
        selectOp = [ProcessState.STOPPED] * round(1000 * self.attemptsRatio['STOP']) + \
                   [ProcessState.RUNNING] * round(1000 * self.attemptsRatio['CONT']) + \
                   [ProcessState.TERMINATED] * round(1000 * self.attemptsRatio['TERM'])
        random.shuffle(selectOp)

        successfulAttempts = 0
        while successfulAttempts < self.attempts:
            proc = random.choice(selectProc)
            op = random.choice(selectOp)
            info = self.processesInfo[proc]

            with info.lock:
                if not ProcessInfo.validStateTransition(info.state, op):
                    continue

                if op == ProcessState.TERMINATED:
                    # Terminations are capped across all stress threads.
                    if not self.terminatedProcs.reserve():
                        continue
                    # This thread will not pick the terminated process again.
                    selectProc.remove(proc)

                # The random pause happens while the process' lock is held.
                time.sleep(float(random.randint(50, 500)) / 1000.0)
                info.handle.send_signal(ProcessInfo.stateToSignal(op))
                info.state = op
                successfulAttempts += 1
                print("Sending {} to process {}".format(ProcessInfo.stateToSignalStr(op), proc))

    def remainingUnterminatedProcesses(self):
        """Return the ids whose state is not TERMINATED, or None if there are none."""
        remaining = []
        for pid, info in self.processesInfo.items():
            with info.lock:
                if info.state != ProcessState.TERMINATED:
                    remaining.append(pid)

        return remaining or None

    def terminateAllProcesses(self):
        """Send SIGTERM to every process not yet marked TERMINATED.

        Stopped processes are sent SIGCONT first. The tracked state is
        updated so later queries reflect the signals that were sent.

        Returns:
            Always False (kept unchanged for existing callers).
        """
        for info in self.processesInfo.values():
            with info.lock:
                if info.state == ProcessState.TERMINATED:
                    continue

                if info.state == ProcessState.STOPPED:
                    info.handle.send_signal(ProcessInfo.stateToSignal(ProcessState.RUNNING))

                info.handle.send_signal(ProcessInfo.stateToSignal(ProcessState.TERMINATED))
                info.state = ProcessState.TERMINATED

        return False

    def continueStoppedProcesses(self):
        """Send SIGCONT to every process marked STOPPED and mark it RUNNING."""
        for info in self.processesInfo.values():
            with info.lock:
                if info.state == ProcessState.STOPPED:
                    info.handle.send_signal(ProcessInfo.stateToSignal(ProcessState.RUNNING))
                    info.state = ProcessState.RUNNING

    def run(self):
        """Run `stress` in `concurrency` threads and wait for all of them."""
        if self.concurrency > 1:
            threads = [threading.Thread(target=self.stress) for _ in range(self.concurrency)]
            for worker in threads:
                worker.start()
            for worker in threads:
                worker.join()
        else:
            self.stress()
|
||
|
|
||
|
def startProcesses(processes, runscript, hostsFilePath, configFilePath, outputDir):
    """Launch `processes` instances of the built binary.

    `runscript` must point to an existing file named `run.sh`; it is only
    used to locate the binary (`bin/da_proc`, or `bin/da_proc.jar` run via
    `java -jar`, next to it) and is not executed itself. Each instance gets
    the ids 1..processes and writes `procNN.output`, `procNN.stdout` and
    `procNN.stderr` into `outputDir`.

    Returns:
        List of (logicalPID, subprocess.Popen) pairs.

    Raises:
        Exception: if the runscript, the output directory or the binary
            cannot be found.
    """
    runscriptPath = os.path.abspath(runscript)
    if not os.path.isfile(runscriptPath):
        raise Exception("`{}` is not a file".format(runscriptPath))

    if os.path.basename(runscriptPath) != 'run.sh':
        raise Exception("`{}` is not a runscript".format(runscriptPath))

    outputDirPath = os.path.abspath(outputDir)
    if not os.path.isdir(outputDirPath):
        raise Exception("`{}` is not a directory".format(outputDirPath))

    baseDir, _ = os.path.split(runscriptPath)
    bin_cpp = os.path.join(baseDir, "bin", "da_proc")
    bin_java = os.path.join(baseDir, "bin", "da_proc.jar")

    if os.path.exists(bin_cpp):
        cmd = [bin_cpp]
    elif os.path.exists(bin_java):
        cmd = ['java', '-jar', bin_java]
    else:
        raise Exception("`{}` could not find a binary to execute. Make sure you build before validating".format(runscriptPath))

    procs = []
    try:
        for pid in range(1, processes + 1):
            filePrefix = os.path.join(outputDirPath, 'proc{:02d}'.format(pid))
            cmd_ext = ['--id', str(pid),
                       '--hosts', hostsFilePath,
                       '--barrier', '{}:{}'.format(BARRIER_IP, BARRIER_PORT),
                       '--signal', '{}:{}'.format(SIGNAL_IP, SIGNAL_PORT),
                       '--output', filePrefix + '.output',
                       configFilePath]

            # The child receives its own copies of the descriptors, so the
            # parent's file objects can be closed as soon as Popen returns.
            with open(filePrefix + '.stdout', "w") as stdoutFd, \
                    open(filePrefix + '.stderr', "w") as stderrFd:
                procs.append((pid, subprocess.Popen(cmd + cmd_ext, stdout=stdoutFd, stderr=stderrFd)))
    except Exception:
        # The caller never receives the partial list, so stop what was started.
        for _, started in procs:
            started.kill()
        raise

    return procs
|
||
|
|
||
|
def main(processes, messages, runscript, broadcastType, logsDir, testConfig):
    """Run one complete stress test followed by validation.

    Steps: configure `tc`, start the barrier and finished-signal listeners,
    generate the hosts/config files, launch the processes, run the stress
    test, resume stopped processes, wait for the finished signal, report
    timings, terminate what is left and validate the output files.

    Args:
        processes: Number of processes to launch.
        messages: Number of messages written to the config file.
        runscript: Path to `run.sh` (see startProcesses).
        broadcastType: "fifo" selects FifoBroadcastValidation; anything else
            selects LCausalBroadcastValidation.
        logsDir: Existing directory for the process outputs.
        testConfig: Mapping with a 'TC' entry (passed to TC) and an 'ST'
            entry providing 'concurrency', 'attempts' and
            'attemptsDistribution' for StressTest.
    """
    # Set tc for loopback
    tc = TC(testConfig['TC'])
    print(tc)

    # Start the barrier
    initBarrier = barrier.Barrier(BARRIER_IP, BARRIER_PORT, processes)
    initBarrier.listen()
    startTimesFuture = initBarrier.startTimesFuture()

    initBarrierThread = threading.Thread(target=initBarrier.wait)
    initBarrierThread.start()

    # Start the finish signal
    finishSignal = finishedSignal.FinishedSignal(SIGNAL_IP, SIGNAL_PORT, processes)
    finishSignal.listen()
    finishSignalThread = threading.Thread(target=finishSignal.wait)
    finishSignalThread.start()

    if broadcastType == "fifo":
        validation = FifoBroadcastValidation(processes, messages, logsDir)
    else:
        validation = LCausalBroadcastValidation(processes, messages, logsDir, None)

    hostsFile, configFile = validation.generateConfig()

    # Bound before the `try` so the `finally` clause can test it even when
    # startProcesses raises (previously that produced a NameError).
    procs = None
    try:
        # Start the processes and get their PIDs
        procs = startProcesses(processes, runscript, hostsFile.name, configFile.name, logsDir)

        # Create the stress test
        st = StressTest(procs,
                        testConfig['ST']['concurrency'],
                        testConfig['ST']['attempts'],
                        testConfig['ST']['attemptsDistribution'])

        for (logicalPID, procHandle) in procs:
            print("Process with logicalPID {} has PID {}".format(logicalPID, procHandle.pid))

        initBarrierThread.join()
        print("All processes have been initialized.")

        st.run()
        print("StressTest is complete.")

        print("Resuming stopped processes.")
        st.continueStoppedProcesses()

        print("Waiting until all running processes have finished broadcasting.")
        finishSignalThread.join()

        for pid, startTs in sorted(startTimesFuture.items()):
            print("Process {} finished broadcasting {} messages in {} ms".format(pid, messages, finishSignal.endTimestamps()[pid] - startTs))

        unterminated = st.remainingUnterminatedProcesses()
        if unterminated is not None:
            input('Hit `Enter` to terminate the remaining processes with logicalPIDs {}.'.format(unterminated))
            st.terminateAllProcesses()

        # Serialises the exit messages printed by the monitor threads.
        mutex = threading.Lock()

        def waitForProcess(logicalPID, procHandle, mutex):
            procHandle.wait()

            with mutex:
                print("Process {} exited with {}".format(logicalPID, procHandle.returncode))

        # Monitor which processes have exited
        monitors = [threading.Thread(target=waitForProcess, args=(logicalPID, procHandle, mutex)) for (logicalPID, procHandle) in procs]
        for monitor in monitors:
            monitor.start()
        for monitor in monitors:
            monitor.join()

        print("Result of validation: {}".format(validation.checkAll()))

    finally:
        try:
            if procs is not None:
                for _, p in procs:
                    p.kill()
        finally:
            # Release the generated hosts/config files once no process needs them.
            hostsFile.close()
            configFile.close()
|
||
|
|
||
|
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # (short flag, long flag, remaining keyword arguments), in registration
    # order. Every option is mandatory.
    argumentSpecs = (
        ("-r", "--runscript", dict(
            dest="runscript",
            help="Path to run.sh",
        )),
        ("-b", "--broadcast", dict(
            choices=["fifo", "lcausal"],
            dest="broadcastType",
            help="Which broadcast implementation to test",
        )),
        ("-l", "--logs", dict(
            dest="logsDir",
            help="Directory to store stdout, stderr and outputs generated by the processes",
        )),
        ("-p", "--processes", dict(
            type=int,
            dest="processes",
            help="Number of processes that broadcast",
        )),
        ("-m", "--messages", dict(
            type=int,
            dest="messages",
            help="Maximum number (because it can crash) of messages that each process can broadcast",
        )),
    )
    for shortFlag, longFlag, options in argumentSpecs:
        parser.add_argument(shortFlag, longFlag, required=True, **options)

    results = parser.parse_args()

    # Network configuration using the tc command
    tcSettings = {
        'delay': ('200ms', '50ms'),
        'loss': ('10%', '25%'),
        'reordering': ('25%', '50%'),
    }

    # Probability with which an interfering thread selects each action
    # (make sure they add up to 1)
    actionDistribution = {
        'STOP': 0.48,
        'CONT': 0.48,
        'TERM': 0.04,
    }

    # StressTest configuration
    stressSettings = {
        'concurrency': 8,  # How many threads interfere with the running processes
        'attempts': 8,     # How many interfering attempts each thread makes
        'attemptsDistribution': actionDistribution,
    }

    testConfig = {'TC': tcSettings, 'ST': stressSettings}

    main(results.processes, results.messages, results.runscript, results.broadcastType, results.logsDir, testConfig)
|