pandas多种格式数据加载
pandas多种格式数据加载
- 在我们实际场景中,我们会在不同地方遇到各种不同数据格式(比如大家熟悉的CSV格式,txt格式,HTML格式,XML格式等等),我们如何用python和这些数据打交道呢?
1.不同格式文本的数据读取
1.1csv读取:
- 正常读取
import pandas as pd df = pd.read_csv("./demo.csv")
- 分隔符读取
df = pd.read_table("./demo.csv",sep=',')
- 不要headers读取(第一行字段)
pd.read_csv("./demo.csv",header=None)
- 指定第一行
pd.read_csv("./demo.csv",names=['a','b','c','d','message'])
- 指定一个索引字段
names=['a','b','c','d',"message"] #指定一个索引字段index_col pd.read_csv("./demo.csv",names=names,index_col="message")
- 跳过某行读取
#表示跳过0,2,3行 df = pd.read_csv("./demo.txt",skiprows=[0,2,3])
- 去除掉NaN的列读取
#去除掉message列不NaN的行 result = pd.read_csv("./demo.csv") result[result.message.isnull()!=True]
- 读取前5行
df = pd.read_csv("./demo.csv",nrow=5)
- 指定chunksize大小读取
chunker = pd.read_csv('./demo.csv',chunksize=100)
1.2数据的写入
- 数据写入csv文件中
data.to_csv("outer.csv")
- 数据在终端打印,以
|
作为分隔符
data.to_csv(sys.stdout,sep="|")
- 当某个数据为空,指定字段做替换
#指定NULL做替换 data.to_csv(sys.stdout,na_rep="NULL")
- 去除header
data.to_csv(sys.stdout,index=False,header=False)
- 指定列(colums)
data.to_csv(sys.stdout,index=False,columns=['a','b','c'])
1.3txt文件读取
- 读到一个列表中
list(open("./demo.txt"))
- 以一个或多个空格作为分割
df = pd.read_table("./demo.txt",sep='\s+')
1.4手动读取
- 当csv文件特变大,需要手动读取
import csv fp = open("demo.csv") read = csv.reader(fp) for line in read: print(line) fp.close()
1.5json格式读取
import json res = json.dumps(obj,ensure_ascii=False)
1.6xml格式解析
from lxml import objectify
1.7与时间相关,输出时间
#表示出2000-1-1开始后38天 import pandas as pd import numpy as np from pandas import Series,DataFrame dates = pd.date_range("1/1/2000",periods=38) ts = Series(np.arange(38),index=dates) ts
2数据库相关操作
2.1 sqlite数据库
- 创表
import sqlite3 query = """ CREATE TABLE test(a VARCHAR(20),b VARCHAR(20),c REAL,d INTEGER); """ con = sqlite3.connect(":memory:") con.execute(query) con.commit()
- 填入数据
data = [("Atlanta","Georgia",1.25,6),("Tallahassee","Florida",2.6,3),("Sacramento","California",1.7,5)] stmt = "INSERT INTO test VALUES(?,?,?,?)" con.executemany(stmt,data) con.commit()
- 查询
cursor = con.execute("select * from test") rows = cursor.fetchall()
- 将从数据库读取的数据,变换成DataFrame
#cursor.description 为游标描述 DataFrame(rows,columns=list(zip(*cursor.description))[0])
2.2MySQL数据库
#coding=utf-8 import pymysql conn = pymysql.connect(host='localhost',port=3306,user="root",passwd="123",db="day39") cur = conn.cursor() #查询 cur.execute("select * from e1") res = cur.fetchall() res #创建数据表 cur.execute("create table stud(id int,name varchar(20),class varchar(30),age varchar(10))") #插入一条数据 cur.execute("insert into stud values(1,'Tom','3year2class','9')") #修改数据 cur.execute("update stud set age='10' where name='Tom'") #删除数据: cur.execute("delete from stud where age='9'") conn.commit() cur.close() conn.close()
2.3Memcache
#coding:utf8 import memcache class MemcachedClient(): ''' python memcached 客户端操作示例 ''' def __init__(self, hostList): self.__mc = memcache.Client(hostList); def set(self, key, value): result = self.__mc.set("name", "NieYong") return result def get(self, key): name = self.__mc.get("name") return name def delete(self, key): result = self.__mc.delete("name") return result if __name__ == '__main__': mc = MemcachedClient(["127.0.0.1:11511", "127.0.0.1:11512"]) key = "name" result = mc.set(key, "NieYong") print "set的结果:", result name = mc.get(key) print "get的结果:", name result = mc.delete(key) print "delete的结果:", result
2.4MongoDB
#encoding:utf=8 import pymongo connection=pymongo.Connection('10.32.38.50',27017) #选择myblog库 db=connection.myblog # 使用users集合 collection=db.users # 添加单条数据到集合中 user = {"name":"cui","age":"10"} collection.insert(user) #同时添加多条数据到集合中 users=[{"name":"cui","age":"9"},{"name":"cui","age":"11"}] collection.insert(users) #查询单条记录 print collection.find_one() #查询所有记录 for data in collection.find(): print data #查询此集合中数据条数 print collection.count() #简单参数查询 for data in collection.find({"name":"1"}): print data #使用find_one获取一条记录 print collection.find_one({"name":"1"}) #高级查询 print "__________________________________________" print '''''collection.find({"age":{"$gt":"10"}})''' print "__________________________________________" for data in collection.find({"age":{"$gt":"10"}}).sort("age"): print data # 查看db下的所有集合 print db.collection_names()
3.API交互
import requests url = "https://api.github.com/repositories/858127/milestones/28/labels" res = requests.get(url) df = DataFrame(res)
相关推荐
xirongxudlut 2020-07-20
wangquannuaa 2020-10-15
jzlixiao 2020-05-15
QianYanDai 2020-05-07
三石 2020-10-30
roamer 2020-10-29
三石 2020-10-29
wangquannuaa 2020-09-29
jzlixiao 2020-09-15
wangquannuaa 2020-08-30
三石 2020-08-23
逍遥友 2020-08-21
jzlixiao 2020-08-18
wangquannuaa 2020-08-17
QianYanDai 2020-08-16
cjsyrwt 2020-08-14
jzlixiao 2020-07-29
mmmjyjy 2020-07-16
QianYanDai 2020-07-05