Here is the background: the database on Alibaba Cloud already sits on a purchased 2 TB disk, the disk has hit its ceiling, and it cannot be expanded any further. That is why the rarely used data in the database is being archived in the way described here.
The overall approach is:
1 Read each day's data in order and write it out as a corresponding CSV file.
2 Automatically compress the CSV into a ZIP archive and delete the CSV.
3 Delete the exported rows from the database.
Here is the main code:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import pymssql
import csv
import datetime
import os, sys
import zipfile


class RunBack:
    tableName = "VendorAPILog"
    headerName = ["ID", "AppSid", "RequestDateTime", "RequestConTime", "VendorAPIName", "RequestUrl",
                  "RequestContent", "ResponseContent", "DateYear", "DateMonth", "DateDay"]
    maxid = 0
    rowids = []

    def __init__(self, curdate):
        print(curdate)
        self.date = datetime.datetime.strptime(curdate, "%Y-%m-%d")
        self.dbaccess()
        self.filepath = os.path.dirname(os.path.realpath(__file__))
        self.filepath = os.path.join(self.filepath, "dblog")

    def dbaccess(self):
        self.db = pymssql.connect(host="xxxxxxxxxx.sqlserver.rds.aliyuncs.com", port=3433,
                                  user="username", password="pwd", database="tablename", charset='utf8')
        self.cursor = self.db.cursor()

    def process(self):
        if os.path.exists(self.filepath):
            print("exists")
        else:
            os.makedirs(self.filepath)
        self.count = self.reminecount()
        if self.count > 0:
            filename = "db_name_%s.csv" % (self.date.strftime("%Y%m%d"))
            fullfilename = os.path.join(self.filepath, filename)
            self.csvfile = open(fullfilename, 'w', newline='', encoding='utf_8')
            self.csvwrite = csv.writer(self.csvfile)
            self.appendheader()
            while self.count > 0:
                self.appendrow()
                self.count = self.reminecount()
            self.cursor.close()
            self.db.close()
            self.csvfile.close()
            # compress the finished csv
            self.zip()

    def appendheader(self):
        self.csvwrite.writerow(self.headerName)

    def appendrow(self):
        # read the next batch from the database
        print("fetching rows")
        self.cursor.execute(
            "select top 2000 * from %s where DateYear=%d and DateMonth=%d and DateDay=%d and id>%d order by id"
            % (self.tableName, self.date.year, self.date.month, self.date.day, self.maxid))
        rows = self.cursor.fetchall()
        print(self.maxid)
        # write the batch to csv and remember the ids for deletion
        for row in rows:
            self.maxid = max(row[0], self.maxid)
            self.rowids.append([row[0], row[9]])   # row[9] is DateMonth, used again in the delete
            self.csvwrite.writerow(row)
        print(self.maxid)
        self.delrow()

    def reminecount(self):
        print("counting remaining rows")
        self.cursor.execute(
            "select count(*) from %s where DateYear=%d and DateMonth=%d and DateDay=%d and id>%d"
            % (self.tableName, self.date.year, self.date.month, self.date.day, self.maxid))
        row = self.cursor.fetchone()
        print(row[0])
        return row[0]

    def zip(self):
        filename = "db_name_%s.zip" % (self.date.strftime("%Y%m%d"))
        fullfilename = os.path.join(self.filepath, filename)
        # ZIP_LZMA needs Python 3.3+ built with lzma support
        zf = zipfile.ZipFile(fullfilename, 'w', compression=zipfile.ZIP_LZMA)
        print(fullfilename)
        csvname = filename.replace("zip", "csv")
        csvpath = os.path.join(self.filepath, csvname)
        print(csvpath)
        zf.write(csvpath, csvname)
        zf.close()
        os.remove(csvpath)

    def delrow(self):
        if len(self.rowids) > 0:
            for rowid in self.rowids:
                sql = "delete from %s where id=%d and DateMonth=%d;" % (self.tableName, rowid[0], rowid[1])
                print(sql)
                self.cursor.execute(sql)
            self.db.commit()
            self.rowids = []


run = RunBack(sys.argv[1])
run.process()
# run.zip()
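For reference, a minimal sketch of how the script can be driven one day at a time, oldest day first. The script name run_back.py and the date range are assumptions for illustration, not part of the script above:

# hypothetical driver: archive one day at a time, oldest first
import datetime
import subprocess

start = datetime.date(2018, 1, 1)    # example archive window, adjust to your data
end = datetime.date(2018, 1, 31)

day = start
while day <= end:
    # the archiving script expects a single YYYY-MM-DD argument (sys.argv[1])
    subprocess.check_call(["python", "run_back.py", day.strftime("%Y-%m-%d")])
    day += datetime.timedelta(days=1)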
Finally, a few things to note:
Do not put extra I/O pressure on the database while archiving, or you may hurt the performance of the production workload.
In my case the instance is on Alibaba Cloud and the relevant indexes are already in place, so the archiving run has almost no visible impact on the production database.
This Alibaba Cloud RDS instance offers roughly 3000 disk IOPS and about 50 MB/s of disk throughput. Apart from the sheer volume of data, the system is not under heavy load, and this particular database is a log database that is essentially write-only. Your situation may differ, so evaluate and measure it for yourself.
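If your instance is more I/O sensitive, one simple mitigation (not in the script above, just a sketch) is to pause between the 2000-row batches so production traffic always has headroom. This assumes the CSV writer has already been opened as in process(), and the pause length is an assumed value you would tune against your own IOPS budget:

import time

PAUSE_SECONDS = 0.5   # assumed value; tune against your own IOPS budget

def process_throttled(runner):
    # same export/delete loop as RunBack.process(), but sleeps between batches
    remaining = runner.reminecount()
    while remaining > 0:
        runner.appendrow()                 # export one batch and delete its rows
        remaining = runner.reminecount()
        time.sleep(PAUSE_SECONDS)          # let production queries catch up

Whether you need this at all depends on the measurements mentioned above.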