224 lines
5.5 KiB
Python
224 lines
5.5 KiB
Python
|
#!/usr/bin/python
|
||
|
import os, sys, csv, glob, re
|
||
|
|
||
|
# --- command-line validation ----------------------------------------------
# check if we got the dirname (the single argument: the working directory)
if len(sys.argv) < 2:
    sys.exit("Please provide the directory to work in.")

# check if exists
if not os.path.exists(sys.argv[1]):
    sys.exit("The path '%s' does not seem to exist." % sys.argv[1])

# ...and that it really is a directory
if not os.path.isdir(sys.argv[1]):
    sys.exit("The path '%s' does not seem to be a directory." % sys.argv[1])

# the rest of the script builds file paths by plain string concatenation
# onto sys.argv[1], so the path has to end in a slash
if sys.argv[1][-1] != '/':
    sys.argv[1] += '/'

# info
print "Working in '%s'" % sys.argv[1]
|
||
|
|
||
|
# writing the data out to a csv file
|
||
|
def write_data(f, data):
    """Dump *data* (a list of rows) as a CSV file named *f* inside the
    working directory given as sys.argv[1]."""
    # target path: working directory (guaranteed to end in '/') + filename
    with open('%s%s' % (sys.argv[1], f), 'wb') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        # bulk form of writing every row one by one
        writer.writerows(data)
|
||
|
|
||
|
|
||
|
# handling the partial sums/averages
|
||
|
# handling the partial sums/averages
def handle_partsum(data, gcol, gn=None):
    """Append a trimmed-average column covering columns gcol.. of each row.

    The first row (plus, when *gn* is given, a second header row) receives
    header labels instead of an average; every remaining row gets the mean
    of its data points with the single minimum and maximum excluded, or 0
    when fewer than 3 points exist.  Returns False (and does nothing) when
    there is no data yet or *gcol* lies at/after the current row width.
    """
    # nothing to close off while the table is still narrower than gcol
    if not data or gcol >= len(data[0]):
        return False

    # row index into the data
    row = 0

    # optional group-name header -- only used by the CSV part of the script
    if data and gn:
        data[row].append(gn)
        row += 1

    # 'average' header, used everywhere
    data[row].append('average')
    row += 1

    # every remaining row gets its trimmed mean appended
    for row in range(row, len(data)):
        # the data points to be averaged, as floats
        points = [float(value) for value in data[row][gcol:]]
        if len(points) < 3:
            # too few points for a min/max-exempt average -- record "0"
            data[row].append(0)
        else:
            # the mean with the single minimum and maximum left out
            data[row].append((sum(points) - min(points) - max(points)) / (len(points) - 2))
|
||
|
|
||
|
#
|
||
|
# handling the *.csv files generated by ab
|
||
|
#
|
||
|
|
||
|
print " - handing CSV files"
|
||
|
|
||
|
# get the files
|
||
|
sources = glob.glob('%s*[0-9].csv' % sys.argv[1])
|
||
|
sources.sort()
|
||
|
|
||
|
if not len(sources):
|
||
|
sys.exit("No files matching the required '*[0-9].csv' globbing pattern found in '%s'." % sys.argv[1])
|
||
|
|
||
|
merged = []
|
||
|
|
||
|
|
||
|
# iterate the source files, merging them column-wise into `merged`
# gn   -- name of the group currently being collected ('' = none yet)
# gcol -- index of the first data column belonging to the current group
gn = ''
gcol = 1
for f in sources:

    # name of the current group, derived from the file name:
    # "path/name-3.csv" -> "name"
    # NOTE(review): only single-digit run numbers match; for "name-10.csv"
    # the sub leaves the full path as the group name -- confirm run counts
    # stay below 10
    ngn = re.sub(r'(.*\/)?(.+)-[0-9]\.csv', r'\2', f)
    # has the group changed?
    if (gn != ngn):
        # close the previous group by appending its average column
        # (a harmless no-op returning False while merged is still empty)
        handle_partsum(merged, gcol, gn)
        # ah, it has! use the new group name
        gn = ngn
        # the new group's first data column starts right after everything
        # collected so far
        if merged:
            gcol = len(merged[0])


    # open the source file
    with open(f, 'rb') as csvfile:

        # if merged is empty, this first file contributes whole rows
        if not merged:
            # headline
            merged.append(['Legend', gn])
            # iterate the rows
            for row in csv.reader(csvfile, delimiter=','):
                # and add whole rows to merged
                merged.append(row)

        # ah, so merged is not empty and already contains the first column
        # no need for the non-data column then, eh?
        else:
            i = 0
            merged[i].append(gn)
            # iterate the rows, appending only each row's value column
            # NOTE(review): assumes every file holds exactly as many rows,
            # in the same order, as the first one -- an IndexError hits
            # otherwise; confirm ab emits uniform CSVs
            for row in csv.reader(csvfile, delimiter=','):
                i+=1
                merged[i].append(row[1])

# the last group never sees a group change inside the loop,
# so close its "dangling partsum" explicitly
handle_partsum(merged, gcol, gn)
|
||
|
|
||
|
print " - writing the result to '%smerged.csv'" % sys.argv[1]
|
||
|
|
||
|
# write it down
|
||
|
'''with open('%smerged.csv' % sys.argv[1], 'wb') as csvfile:
|
||
|
csvw = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
|
||
|
for row in merged:
|
||
|
csvw.writerow(row)'''
|
||
|
write_data('merged.csv', merged)
|
||
|
|
||
|
#
|
||
|
# *.log files need to be handled differently
|
||
|
#
|
||
|
|
||
|
print " - handing LOG files"
|
||
|
|
||
|
# get the files
|
||
|
sources = glob.glob('%s*[0-9].log' % sys.argv[1])
|
||
|
sources.sort()
|
||
|
|
||
|
if not len(sources):
|
||
|
sys.exit("No files matching the required '*[0-9].log' globbing pattern found in '%s'." % sys.argv[1])
|
||
|
|
||
|
# the metadata fields scraped from the first log file; each entry starts
# out as a one-element row [label] and gets its value appended later
meta = [[label] for label in (
    'Server Software',
    'Server Hostname',
    'Server Port',
    'Document Path',
    'Document Length',
    'Concurrency Level',
)]

# the statistics table: one row per expected summary line, merged across
# files ('Time per request' is deliberately listed twice -- the scan below
# matches lines by prefix in order, and two such lines are expected)
merged = [[label] for label in (
    'Legend',
    'Time taken for tests',
    'Complete requests',
    'Failed requests',
    'Write errors',
    'Non-2xx responses',
    'Total transferred',
    'HTML transferred',
    'Requests per second',
    'Time per request',
    'Time per request',
    'Transfer rate',
)]
|
||
|
|
||
|
# first, handle metadata
|
||
|
# from the first file only, that's entirely enough
|
||
|
i = 0
|
||
|
with open(sources[0], 'rt') as f:
|
||
|
for l in f:
|
||
|
if l[:len(meta[i][0])] == meta[i][0]:
|
||
|
meta[i].append(re.sub(r'%s: +(.*)[\n\r]*' % meta[i][0], r'\1', l))
|
||
|
i+=1
|
||
|
if i >= len(meta):
|
||
|
break
|
||
|
|
||
|
# we have the metadata, let's write these to a file!
|
||
|
print " - writing metadata to '%smeta.csv'" % sys.argv[1]
|
||
|
write_data('meta.csv', meta)
|
||
|
|
||
|
# now, handle statistical data: merge each source file column-wise into
# `merged`, using the same group/gcol bookkeeping as the CSV section
gn = ''
gcol = 1
for s in sources:

    # group name
    # name of the current group, derived from the file name:
    # "path/name-3.log" -> "name"
    ngn = re.sub(r'(.*\/)?(.+)-[0-9]\.log', r'\2', s)
    # has the group changed?
    if (gn != ngn):
        # close the previous group by appending its average column
        # (no gn argument here: header labels are appended per file below,
        # unlike in the CSV section)
        handle_partsum(merged, gcol)
        # ah, it has! use the new group name
        gn = ngn
        # the "group column" (the first column in a group)
        gcol = len(merged[0])

    # add the header label for this file's column
    merged[0].append(gn)

    # index of the merged list, skipping the first row (the header)
    i = 1

    # scan the log; the summary lines are expected in exactly the order
    # of the `merged` row labels
    with open(s, 'rt') as f:
        for l in f:
            if l[:len(merged[i][0])] == merged[i][0]:
                # keep only the numeric part of "Label:   1234.56 ..."
                merged[i].append(re.sub(r'%s: +([0-9,\.]+).*[\n\r]*' % merged[i][0], r'\1', l))
                i+=1
                if i >= len(merged):
                    break
    # NOTE(review): if one expected line is missing from a log (ab is said
    # to omit "Non-2xx responses" when there are none), every later field
    # of that file is skipped and the rows end up ragged -- confirm the
    # logs always contain all expected summary lines

# the last group never sees a group change inside the loop,
# so close its dangling gcol explicitly
handle_partsum(merged, gcol)


# we have the stats, let's write these to a file!
print " - writing the result to '%smerged-logs.csv'" % sys.argv[1]
write_data('merged-logs.csv', merged)
|