8-15 908 views
创建模拟大数据
表结构
DROP TABLE IF EXISTS `t_user`;
CREATE TABLE `t_user` (
`id` int(11) unsigned NOT NULL AUTO_INCREMENT,
`id_time` bigint(20) unsigned NOT NULL,
`id_uuid` varchar(36) NOT NULL,
`sequence` int(11) unsigned NOT NULL,
`account` varchar(20) NOT NULL,
`password` varchar(32) DEFAULT NULL,
`name` varchar(40) DEFAULT NULL,
`email` varchar(32) NOT NULL,
`phone` varchar(15) NOT NULL,
`qq` varchar(15) DEFAULT NULL,
`apiKey` varchar(40) DEFAULT NULL,
`balance` float DEFAULT NULL,
`birthday` date DEFAULT NULL,
`createdAt` datetime DEFAULT NULL,
`updatedAt` datetime DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
使用Python脚本创建1000W模拟数据,并每隔10W条数据打印一次耗时。
import MySQLdb
import random
import string
import hashlib
import uuid
from datetime import *
import time
LETTER_LOWER = 'abcdefghijklmnopqrstuvwxyz'
LETTER_UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
DIGIT = '0123456789'
DIGITx4 = DIGIT+DIGIT+DIGIT+DIGIT
EMAIL = ['@gmail.com', '@qq.com', '@163.com', '@fly3w.com', '@live.com', '@sina.cn',
'@sina.com', '@hotmail.com', '@foxmail.com', '@yunyu.com', '@rayvision.com',
'@love.com', '@aliyun.com', '@outlook.com', '@tom.com', '@126.com', '@198.com']
LETTERS = LETTER_LOWER + LETTER_UPPER
STRING = LETTERS + DIGIT
MD5 = hashlib.md5()
def random_string(i):
return string.join(random.sample(STRING, i)).replace(' ','')
def random_digit(i):
if i > 40 :
i = 40
return string.join(random.sample(DIGITx4, i)).replace(' ','')
def random_letter_lower(i):
return string.join(random.sample(LETTER_LOWER, i)).replace(' ','')
def random_letter_upper(i):
return string.join(random.sample(LETTER_UPPER, i)).replace(' ','')
def random_account():
return random_string(random.randint(3,20))
def random_password():
p = random_string(random.randint(6,20))
MD5.update(p)
return MD5.hexdigest()
def random_email():
return random_string(random.randint(4,15)) + string.join(random.sample(EMAIL, 1))
def random_name():
return random_string(random.randint(4,10)) + '.' + random_string(random.randint(4,10))
def random_phone():
return '1' + random_digit(10)
def random_qq():
return random_digit(random.randint(6,13))
def random_key():
return random_string(40)
def random_balance():
return float(random.uniform(0.0, 999999))
def random_createdAt():
return datetime.fromtimestamp(random.randint(1000000000, 1470987994))
def random_updatedAt():
return datetime.fromtimestamp(random.randint(1000000000, 1470987994))
def random_birthday():
return date.fromtimestamp(random.randint(100000000, 1000000000))
try:
conn=MySQLdb.connect(host='localhost',user='root',passwd='',db='bigdata',port=3306)
cur=conn.cursor()
R1 = 100
R2 = 100
R3 = 1000
TIME = time.time()
for k in range(R1):
time1 = time.clock()
for j in range(R2):
values=[]
for i in range(R3):
sequence = k*R2*R3 + j*R3 + i + 1
id_time = TIME + sequence
id_uuid = uuid.uuid1()
account = random_account()
password = random_password()
name = random_name()
email = random_email()
phone = random_phone()
qq = random_qq()
apiKey = random_key()
balance = random_balance()
birthday = random_birthday()
createdAt = random_createdAt()
updatedAt = random_updatedAt()
values.append((id_time, id_uuid, sequence, account, password, name, email, phone, qq, apiKey, balance, birthday, createdAt, updatedAt))
# print i , j , k , sequence
cur.executemany('insert into t_user (id_time, id_uuid, sequence, account, password, name, email, phone, qq, apiKey, balance, birthday, createdAt, updatedAt) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',values)
cur.executemany('insert into t_user_opt (id_time, id_uuid, sequence, account, password, name, email, phone, qq, apiKey, balance, birthday, createdAt, updatedAt) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',values)
conn.commit()
time2 = time.clock()
# print "insert %d records cost %s seconds"%((i+1)*(j+1)*(k+1) ,time2 - time1)
print "insert %d records cost %s seconds"%((i+1)*(j+1)*(k+1) ,time2 - time1)
cur.close()
conn.close()
except MySQLdb.Error,e:
print "Mysql Error %d: %s" % (e.args[0], e.args[1])
注释掉插入提交语句,看看空跑需要的时间:
insert 1800000 records cost 13.9562696992 seconds
insert 1900000 records cost 13.4804223272 seconds
insert 2000000 records cost 13.3164563912 seconds
insert 2100000 records cost 13.2930246514 seconds
insert 2200000 records cost 13.3073816569 seconds
insert 2300000 records cost 14.0792722724 seconds
insert 2400000 records cost 13.3529577188 seconds
insert 2500000 records cost 13.2821370229 seconds
insert 2600000 records cost 13.4949583231 seconds
insert 2700000 records cost 13.1573338714 seconds
insert 2800000 records cost 13.168848377 seconds
insert 2900000 records cost 13.8299750973 seconds
insert 3000000 records cost 13.7217753637 seconds
完整插入提交数据所花费的时间:
..........
insert 1800000 records cost 36.0650906447 seconds
insert 1900000 records cost 35.2314557097 seconds
insert 2000000 records cost 35.5520021725 seconds
insert 2100000 records cost 35.7447798253 seconds
insert 2200000 records cost 36.0288146973 seconds
insert 2300000 records cost 36.7669691414 seconds
insert 2400000 records cost 35.8457380811 seconds
..........
insert 8800000 records cost 33.5214939874 seconds
insert 8900000 records cost 34.2245814086 seconds
insert 9000000 records cost 34.2755526372 seconds
insert 9100000 records cost 34.2524004673 seconds
insert 9200000 records cost 34.9790082697 seconds
insert 9300000 records cost 35.7389869879 seconds
insert 9400000 records cost 35.0853273719 seconds
insert 9500000 records cost 35.8309244529 seconds
..........