Dplython 数据分析库 项目简介
Dplython 是 Dplyr 的 Python 版本。Dplyr 是一个用于快速、轻松地分析数据的 R 语言库。Dplyr 的理念是把数据操作限制在少数几个与最常见任务相对应的简单函数上。这种对应关系让思考过程更贴近编写代码的过程,帮助您以“思维的速度”分析数据。

安装:
pip install git+https://github.com/dodger487/dplython.git

使用:
from dplython import *
diamonds >> select(X.carat, X.cut, X.price) >> head(5)

# Filter out rows using dfilter
diamonds >> dfilter(X.carat > 4) >> select(X.carat, X.cut, X.depth, X.price)

# Sample with sample_n or sample_frac, sort with arrange
(diamonds >>
  sample_n(10) >>
  arrange(X.carat) >>
  select(X.carat, X.cut, X.depth, X.price))

# You can:
#   add columns with mutate (referencing other columns!)
#   group rows into dplyr-style groups with group_by
#   collapse rows into single rows using summarize
(diamonds >>
  mutate(carat_bin=X.carat.round()) >>
  group_by(X.cut, X.carat_bin) >>
  summarize(avg_price=X.price.mean()))

# If you have column names that don't work as attributes, you can use an
# alternate "get item" notation with X.
diamonds["column w/ spaces"] = range(len(diamonds))
diamonds >> select(X["column w/ spaces"]) >> head()

# It's possible to pass the entire dataframe using X._
diamonds >> sample_n(6) >> select(X.carat, X.price) >> X._.T

# To pass the DataFrame or columns into functions, apply @DelayFunction
@DelayFunction
def PairwiseGreater(series1, series2):
    """Return the element-wise maximum of two series.

    Values are paired positionally with ``zip`` and the larger of each pair
    is kept. The result is relabelled with ``series1``'s index.

    The ``@DelayFunction`` decorator is what allows this function to be
    called with ``X`` column expressions inside a ``>>`` pipeline, e.g.
    ``diamonds >> PairwiseGreater(X.x, X.y)``.

    NOTE(review): assumes both inputs have the same length; the index
    assignment below would not match a shorter result -- confirm.
    """
    index = series1.index
    newSeries = pandas.Series([max(s1, s2) for s1, s2 in zip(series1, series2)])
    # Restore the original row labels so the result lines up with series1.
    newSeries.index = index
    return newSeries
diamonds >> PairwiseGreater(X.x, X.y)

# Passing entire dataframe and plotting with ggplot
from ggplot import *
ggplot = DelayFunction(ggplot)  # Simple installation
diamonds = DplyFrame(pandas.read_csv('./diamonds.csv'))  # Masked in ggplot pkg
(diamonds >> ggplot(aes(x="carat", y="price", color="cut"), data=X._) +
  geom_point() + facet_wrap("color"))

(diamonds >>
  dfilter((X.clarity == "I1") | (X.clarity == "IF")) >>
  ggplot(aes(x="carat", y="price", color="color"), X._) +
    geom_point() +
    facet_wrap("clarity"))

# Matplotlib works as well!
import pylab as pl
pl.scatter = DelayFunction(pl.scatter)
diamonds >> sample_frac(0.1) >> pl.scatter(X.carat, X.price)
diamonds >> select(X.carat, X.cut, X.price) >> head(5)

# Filter out rows using dfilter
diamonds >> dfilter(X.carat > 4) >> select(X.carat, X.cut, X.depth, X.price)

# Sample with sample_n or sample_frac, sort with arrange
(diamonds >>
  sample_n(10) >>
  arrange(X.carat) >>
  select(X.carat, X.cut, X.depth, X.price))

# You can:
#   add columns with mutate (referencing other columns!)
#   group rows into dplyr-style groups with group_by
#   collapse rows into single rows using summarize
(diamonds >>
  mutate(carat_bin=X.carat.round()) >>
  group_by(X.cut, X.carat_bin) >>
  summarize(avg_price=X.price.mean()))

# If you have column names that don't work as attributes, you can use an
# alternate "get item" notation with X.
diamonds["column w/ spaces"] = range(len(diamonds))
diamonds >> select(X["column w/ spaces"]) >> head()

# It's possible to pass the entire dataframe using X._
diamonds >> sample_n(6) >> select(X.carat, X.price) >> X._.T

# To pass the DataFrame or columns into functions, apply @DelayFunction
@DelayFunction
def PairwiseGreater(series1, series2):
    """Return the element-wise maximum of two series.

    Values are paired positionally with ``zip`` and the larger of each pair
    is kept. The result is relabelled with ``series1``'s index.

    The ``@DelayFunction`` decorator is what allows this function to be
    called with ``X`` column expressions inside a ``>>`` pipeline, e.g.
    ``diamonds >> PairwiseGreater(X.x, X.y)``.

    NOTE(review): assumes both inputs have the same length; the index
    assignment below would not match a shorter result -- confirm.
    """
    index = series1.index
    newSeries = pandas.Series([max(s1, s2) for s1, s2 in zip(series1, series2)])
    # Restore the original row labels so the result lines up with series1.
    newSeries.index = index
    return newSeries
diamonds >> PairwiseGreater(X.x, X.y)

# Passing entire dataframe and plotting with ggplot
from ggplot import *
ggplot = DelayFunction(ggplot)  # Simple installation
diamonds = DplyFrame(pandas.read_csv('./diamonds.csv'))  # Masked in ggplot pkg
(diamonds >> ggplot(aes(x="carat", y="price", color="cut"), data=X._) +
  geom_point() + facet_wrap("color"))

(diamonds >>
  dfilter((X.clarity == "I1") | (X.clarity == "IF")) >>
  ggplot(aes(x="carat", y="price", color="color"), X._) +
    geom_point() +
    facet_wrap("clarity"))

# Matplotlib works as well!
import pylab as pl
pl.scatter = DelayFunction(pl.scatter)
diamonds >> sample_frac(0.1) >> pl.scatter(X.carat, X.price)