#!/usr/bin/python
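"""Merge the per-run CSV and log files produced by ApacheBench (ab) in a
given directory into summary files: 'merged.csv' (from the *[0-9].csv
files), 'meta.csv' (run metadata) and 'merged-logs.csv' (from the
*[0-9].log files).

Usage: ./merge-data.py <directory>
"""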
import os, sys, csv, glob, re
# check if we got the dirname
if len(sys.argv) < 2:
    sys.exit("Please provide the directory to work in.")
# check if it exists and is a directory
if not os.path.exists(sys.argv[1]):
    sys.exit("The path '%s' does not seem to exist." % sys.argv[1])
if not os.path.isdir(sys.argv[1]):
    sys.exit("The path '%s' does not seem to be a directory." % sys.argv[1])
# make sure the path ends with a slash
if sys.argv[1][-1] != '/':
    sys.argv[1] += '/'
# info
print "Working in '%s'" % sys.argv[1]
# writing the data out to a csv file
def write_data(f, data):
    with open('%s%s' % (sys.argv[1], f), 'wb') as csvfile:
        csvw = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for row in data:
            csvw.writerow(row)
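# a quick usage sketch (row contents hypothetical):
#   write_data('merged.csv', [['Legend', 'run'], ['0', '4']])
# writes a two-row CSV file into the working directory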
# handling the partial sums/averages
def handle_partsum(data, gcol, gn=None):
    # if gcol is at (or past) the last column, there is nothing to average yet
    if not data or gcol >= len(data[0]):
        return False
    # indexing the data
    i = 0
    # add the groupname header
    # (only used in the CSV part of this script)
    if gn:
        data[i].append(gn)
        i += 1
    # 'average' header, used everywhere
    data[i].append('average')
    i += 1
    # let's iterate through the remaining rows and append each partial average
    for i in range(i, len(data)):
        # the data points to be averaged
        partsum = [float(v) for v in data[i][gcol:]]
        # pretty straightforward -- the average is "0" if there are fewer than 3 data points...
        if len(partsum) < 3:
            data[i].append(0)
        # ...otherwise it's the average of the data points (with the minimum and maximum exempt)
        else:
            data[i].append((sum(partsum) - min(partsum) - max(partsum)) / (len(partsum) - 2))
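# a worked example of the trimmed average above, with hypothetical values:
#   partsum = [3.0, 1.0, 2.0, 10.0]
#   (16.0 - 1.0 - 10.0) / (4 - 2) = 5.0 / 2 = 2.5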
#
# handling the *.csv files generated by ab
#
print " - handling CSV files"
# get the files
sources = glob.glob('%s*[0-9].csv' % sys.argv[1])
sources.sort()
if not sources:
    sys.exit("No files matching the required '*[0-9].csv' globbing pattern found in '%s'." % sys.argv[1])
merged = []
# iterate the source files
gn = ''
gcol = 1
for f in sources:
    # name of the current group
    ngn = re.sub(r'(.*\/)?(.+)-[0-9]\.csv', r'\2', f)
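    # e.g. a (hypothetical) 'results/static-0.csv' yields the group name 'static'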
    # has the group changed?
    if gn != ngn:
        # handle the partsum of the group we just finished
        handle_partsum(merged, gcol, gn)
        # ah, it has! use the new group name
        gn = ngn
        # which is the active gcol?
        if merged:
            gcol = len(merged[0])
    # open the source file
    with open(f, 'rb') as csvfile:
        # if merged is empty
        if not merged:
            # headline
            merged.append(['Legend', gn])
            # iterate the rows
            for row in csv.reader(csvfile, delimiter=','):
                # and add whole rows to merged
                merged.append(row)
        # ah, so merged is not empty and already contains the first column
        # no need for the non-data column then, eh?
        else:
            i = 0
            merged[i].append(gn)
            # iterate the rows
            for row in csv.reader(csvfile, delimiter=','):
                i += 1
                merged[i].append(row[1])
# handle the "dangling" partsum of the last group
handle_partsum(merged, gcol, gn)
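# at this point merged looks roughly like this for a hypothetical
# group 'run' measured three times:
#   Legend,run,run,run,run
#   Percentage served,Time in ms,Time in ms,Time in ms,average
#   0,4,5,6,5.0    (5.0 being the trimmed average of 4, 5 and 6)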
print " - writing the result to '%smerged.csv'" % sys.argv[1]
# write it down
write_data('merged.csv', merged)
#
# *.log files need to be handled differently
#
print " - handing LOG files"
# get the files
sources = glob.glob('%s*[0-9].log' % sys.argv[1])
sources.sort()
if not len(sources):
sys.exit("No files matching the required '*[0-9].log' globbing pattern found in '%s'." % sys.argv[1])
# the metadata fields to look for
meta = [
    ['Server Software'],
    ['Server Hostname'],
    ['Server Port'],
    ['Document Path'],
    ['Document Length'],
    ['Concurrency Level']
]
# the statistic rows to collect
merged = [
    ['Legend'],
    ['Time taken for tests'],
    ['Complete requests'],
    ['Failed requests'],
    ['Write errors'],
    ['Non-2xx responses'],
    ['Total transferred'],
    ['HTML transferred'],
    ['Requests per second'],
    ['Time per request'],  # (mean)
    ['Time per request'],  # (mean, across all concurrent requests)
    ['Transfer rate']
]
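# for reference, the matched lines in ab's log output look roughly like
# this (values hypothetical):
#   Requests per second:    1234.56 [#/sec] (mean)
#   Time per request:       8.101 [ms] (mean)
# note that ab only prints 'Non-2xx responses' when such responses occurred;
# a log without that line leaves the later rows of that file without a value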
# first, handle metadata
# from the first file only, that's entirely enough
i = 0
with open(sources[0], 'rt') as f:
    for l in f:
        if l[:len(meta[i][0])] == meta[i][0]:
            meta[i].append(re.sub(r'%s: +(.*)[\n\r]*' % meta[i][0], r'\1', l))
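            # e.g. 'Document Length:        1024 bytes' yields
            # '1024 bytes' (value hypothetical)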
            i += 1
            if i >= len(meta):
                break
# we have the metadata, let's write these to a file!
print " - writing metadata to '%smeta.csv'" % sys.argv[1]
write_data('meta.csv', meta)
# now, handle statistical data
# each source file please!
gn = ''
gcol = 1
for s in sources:
    # name of the current group
    ngn = re.sub(r'(.*\/)?(.+)-[0-9]\.log', r'\2', s)
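    # same grouping rule as for the CSV files above;
    # e.g. a (hypothetical) 'results/static-2.log' yields 'static'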
    # has the group changed?
    if gn != ngn:
        # handle partsum
        handle_partsum(merged, gcol)
        # ah, it has! use the new group name
        gn = ngn
        # the "group column" (the first column in a group)
        gcol = len(merged[0])
    # add the header
    merged[0].append(gn)
    # index of the merged list, skipping the first row (the header)
    i = 1
    # on with it!
    with open(s, 'rt') as f:
        for l in f:
            if l[:len(merged[i][0])] == merged[i][0]:
                merged[i].append(re.sub(r'%s: +([0-9,\.]+).*[\n\r]*' % merged[i][0], r'\1', l))
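                # e.g. 'Transfer rate:          345.67 [Kbytes/sec] received'
                # yields '345.67' (value hypothetical)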
                i += 1
                if i >= len(merged):
                    break
# handle the dangling partsum of the last group
handle_partsum(merged, gcol)
# we have the stats, let's write these to a file!
print " - writing the result to '%smerged-logs.csv'" % sys.argv[1]
write_data('merged-logs.csv', merged)