txt与xls代码处理示例
1. 背景
最近因为服务器训练时间不稳定的问题,LZ就想找出具体时间消耗的原因,但是训练平台只能保存成txt,才能下载,于是LZ把所有的数据都保存成txt的形式,当然如果
小伙伴在保存文本时,使用’\t’来保存,这样下载对应的文本后,就可以直接手动复制到excel表格中,这也是一种方式。
2. 环境配置
conda install xlrd
conda install xlwt
2. txt2excel示例
主要显示了怎么使用xlwt建立新的workbook,增加sheet,传入数据,并且保存对应的excel
def txt2xls():
epoch_lists = glob.glob(log_path_root + "*.txt")
print("the length of epoch lists: ", len(epoch_lists))
wb = xlwt.Workbook(encoding='ascii')
total_sheet = wb.add_sheet("total_part")
for epoch_log_path in epoch_lists:
f = open(epoch_log_path, 'r')
# 获取对应的测试名称,并且增加对应sheet
test_name = epoch_log_path.split("/")[-1].split(".")[0]
part_sheet = wb.add_sheet(test_name + "_epoch")
part_sheet.write(0, 0, label='num_epoch')
part_sheet.write(0, 1, label='data_time(min)')
part_sheet.write(0, 2, label='batch_time(min)')
part_sheet.write(0, 3, label='read_img(min)')
part_sheet.write(0, 4, label='hsv_aug(min)')
part_sheet.write(0, 5, label='resize_img(min)')
part_sheet.write(0, 6, label='random_affine(min)')
part_sheet.write(0, 7, label='contiguousarray(min)')
part_sheet.write(0, 8, label='transforms(min)')
part_sheet.write(0, 9, label='det_gt(min)')
# 获得对应行的数据,并进行分析
lines = f.readlines()
row_count = 1
for line in lines:
split_list = line.strip().split("|")
# print(len(split_list))
# print(split_list[6:-1])
col_count = 0
part_sheet.write(row_count, col_count, label=row_count - 1)
for split_chunk in split_list[6:-1]:
tmp = split_chunk.split(" ")[2]
col_count += 1
part_sheet.write(row_count, col_count, label=tmp)
row_count += 1
f.close()
iter_lists = glob.glob(log_path_root + "*.log")
print("the length of epoch lists: ", len(iter_lists))
for iter_log_path in iter_lists:
f = open(iter_log_path, 'r')
# 获取对应的测试名称,并且增加对应sheet
test_name = iter_log_path.split("/")[-1].split(".")[0].split("_")
print(test_name)
part_sheet = wb.add_sheet(test_name[-2] + "_" + test_name[-1] + "_iter")
part_sheet.write(0, 0, label='num_iter')
part_sheet.write(0, 1, label='read_img(s)')
part_sheet.write(0, 2, label='hsv_aug(s)')
part_sheet.write(0, 3, label='resize_img(s)')
part_sheet.write(0, 4, label='random_affine(s)')
part_sheet.write(0, 5, label='contiguousarray(s)')
part_sheet.write(0, 6, label='transforms(s)')
part_sheet.write(0, 7, label='det_gt(s)')
# 获得对应行的数据,并进行分析
lines = f.readlines()
print("the length of lines: ", len(lines))
# 在这个里LZ做了一个随机采样,去掉前1000次打迭代,等训练稳定以后,在随机采样时间
random_select_1000 = random.sample(range(1000, len(lines) - 2), 1002)
# print(random_select_10000)
row_count = 1
for idx in random_select_1000:
line = lines[idx]
split_list = line.strip().split("|")
if len(split_list) < 5:#防止出现一些不符合要求的输出,这个只是做个简单判断
continue
# print(len(split_list))
# print(split_list[6:-1])
col_count = 0
label_tmp = "{}(idex:{})".format(row_count - 1, idx)
part_sheet.write(row_count, col_count, label=label_tmp)
for split_chunk in split_list[11:]:
tmp = split_chunk.split(" ")[2][0:-1]
# print(tmp)
col_count += 1
part_sheet.write(row_count, col_count, label=tmp)
row_count += 1
f.close()
wb.save(log_path_root + 'timeconsuming.xls')
print("Done!")
数据保存形式,epoch的文档格式
epoch: 1 |loss -1.482269 | hm_loss 0.239190 | wh_loss 0.390505 | off_loss 0.110499 | id_loss 0.029421 | data_time 261.601979 | time 384.200000 | read_img 1610.162382 | hsv_aug 1167.812609 | resize_img 29.079759 | random_affine 33.654764 | contiguousarray 9.320365 | transforms 16.844176 | det_gt 19.504466 |
log的文档格式
test_CPU0_419| train: [1][0/10157]|Tot: 0:00:49 |ETA: 0:00:00 |loss 0.2126 |hm_loss 0.2379 |wh_loss 0.3552 |off_loss 0.1091 |id_loss 0.3125 |Data 34.718s(34.718s) |Net 49.731s | read_img 13.838s | hsv_aug 18.123s | resize_img 0.624s | random_affine 0.662s | contiguousarray 0.068s | transforms 0.152s | det_gt 0.144s
3. 解析excel
主要显示了怎么使用xlrd打开workbook, 读取对应sheet names,获得sheet里面的数据并进行处理
def parseXls():
workbook = xlrd.open_workbook(log_path_root + 'timeconsuming.xls')
wb_parse = xlwt.Workbook(encoding='ascii')
# 配置epoch_parse表头
epoch_parse = wb_parse.add_sheet("epoch_parse")
epoch_parse.write(0, 0, label='test_name')
epoch_parse.write(0, 1, label='data_time(min)')
epoch_parse.write(0, 2, label='batch_time(min)')
epoch_parse.write(0, 3, label='read_img(min)')
epoch_parse.write(0, 4, label='hsv_aug(min)')
epoch_parse.write(0, 5, label='resize_img(min)')
epoch_parse.write(0, 6, label='random_affine(min)')
epoch_parse.write(0, 7, label='contiguousarray(min)')
epoch_parse.write(0, 8, label='transforms(min)')
epoch_parse.write(0, 9, label='det_gt(min)')
# 配置iter_parse表头
iter_parse = wb_parse.add_sheet("iter_parse")
iter_parse.write(0, 0, label='test_name')
iter_parse.write(0, 1, label='read_img(s)')
iter_parse.write(0, 2, label='hsv_aug(s)')
iter_parse.write(0, 3, label='resize_img(s)')
iter_parse.write(0, 4, label='random_affine(s)')
iter_parse.write(0, 5, label='contiguousarray(s)')
iter_parse.write(0, 6, label='transforms(s)')
iter_parse.write(0, 7, label='det_gt(s)')
print(workbook.sheet_names())
row_epoch = 1
row_iter = 1
for sheet_name in workbook.sheet_names():
# for sheet_name in ["CPU0_138_iter"]:
print(sheet_name)
sheet = workbook.sheet_by_name(sheet_name) # 获取对应sheetname的数据
print(sheet.name, sheet.nrows, sheet.ncols)
if 'epoch' in sheet_name:
epoch_parse.write(row_epoch, 0, label=sheet_name)
for col_idx in range(1, sheet.ncols):
col_values = sheet.col_values(col_idx)[1:]
tmp = np.array([float(item) for item in col_values])
print(tmp, np.mean(tmp), np.min(tmp), np.max(tmp))
# label_new = "{:.02f}(min: {:.02f}, max:{:.02f}, std:{:.02f})".format(np.mean(tmp), np.min(tmp),
# np.max(tmp), np.std(tmp))
# label_new = "{:.02f}".format(np.mean(tmp))
epoch_parse.write(row_epoch, col_idx, np.mean(tmp))
row_epoch += 1
elif 'iter' in sheet_name:
iter_parse.write(row_iter, 0, label=sheet_name)
for col_idx in range(1, sheet.ncols):
col_values = sheet.col_values(col_idx)[1:]
tmp = np.array([float(item) for item in col_values])
print(np.mean(tmp), np.min(tmp), np.max(tmp))
# label_new = "{:.02f}(min: {:.02f}, max:{:.02f}, std:{:.02f})".format(np.mean(tmp), np.min(tmp),
# np.max(tmp), np.std(tmp))
# label_new = "{:.02f}".format(np.mean(tmp))
iter_parse.write(row_iter, col_idx, np.mean(tmp))
row_iter += 1
wb_parse.save(log_path_root + 'timeconsuming_parse.xls')
print("Done!")
print("XLS file parse successfully!")
PS:小学的课本上,有袁隆平爷爷弯腰研究杂交水稻的照片。2021年5月22日他走了。看到很多公众号,朋友圈里都在缅怀这一位老人,lz也是打心眼里佩服他,一心为了人类的事业,奉献出自己的一生,是行动的巨人,时代的楷模,这才是真正的偶像吧!