4.4. DataFrame Sample¶
import pandas as pd
import numpy as np
# Fix the seed of NumPy's global generator so the values below are reproducible.
np.random.seed(0)

# Seven consecutive days (rows) by four times of day (columns),
# filled with random numbers drawn by `np.random.randn`.
df = pd.DataFrame(
    data=np.random.randn(7, 4),
    index=pd.date_range('1999-12-30', periods=7),
    columns=['Morning', 'Noon', 'Evening', 'Midnight'],
)
df
# Morning Noon Evening Midnight
# 1999-12-30 1.764052 0.400157 0.978738 2.240893
# 1999-12-31 1.867558 -0.977278 0.950088 -0.151357
# 2000-01-01 -0.103219 0.410599 0.144044 1.454274
# 2000-01-02 0.761038 0.121675 0.443863 0.333674
# 2000-01-03 1.494079 -0.205158 0.313068 -0.854096
# 2000-01-04 -2.552990 0.653619 0.864436 -0.742165
# 2000-01-05 2.269755 -1.454366 0.045759 -0.187184
4.4.1. Head¶
# Take rows from the top of the frame; here the row count is passed positionally.
df.head(2)
# Morning Noon Evening Midnight
# 1999-12-30 1.764052 0.400157 0.978738 2.240893
# 1999-12-31 1.867558 -0.977278 0.950088 -0.151357

# Same method with the row count passed by keyword: only the first row.
df.head(n=1)
# Morning Noon Evening Midnight
# 1999-12-30 1.764052 0.400157 0.978738 2.240893
4.4.2. Tail¶
# Take rows from the bottom of the frame; here the row count is passed positionally.
df.tail(2)
# Morning Noon Evening Midnight
# 2000-01-04 -2.552990 0.653619 0.864436 -0.742165
# 2000-01-05 2.269755 -1.454366 0.045759 -0.187184

# Same method with the row count passed by keyword: only the last row.
df.tail(n=1)
# Morning Noon Evening Midnight
# 2000-01-05 2.269755 -1.454366 0.045759 -0.187184
4.4.3. First¶
# `first(offset)` keeps the initial rows of a date-indexed frame that fall
# within the given period, counted from the first index entry (1999-12-30).
# NOTE: `DataFrame.first` is deprecated since pandas 2.1 -- on recent versions
# filter with `.loc` instead; check the documentation for your pandas version.

# 'Y' (year): rows up to the end of 1999.
df.first('Y')
# Morning Noon Evening Midnight
# 1999-12-30 1.764052 0.400157 0.978738 2.240893
# 1999-12-31 1.867558 -0.977278 0.950088 -0.151357

# 'M' (month): rows up to the end of December 1999.
df.first('M')
# Morning Noon Evening Midnight
# 1999-12-30 1.764052 0.400157 0.978738 2.240893
# 1999-12-31 1.867558 -0.977278 0.950088 -0.151357

# 'D' (day): only the first day.
df.first('D')
# Morning Noon Evening Midnight
# 1999-12-30 1.764052 0.400157 0.978738 2.240893

# 'W' (week): rows up to and including 2000-01-02 (a Sunday).
df.first('W')
# Morning Noon Evening Midnight
# 1999-12-30 1.764052 0.400157 0.978738 2.240893
# 1999-12-31 1.867558 -0.977278 0.950088 -0.151357
# 2000-01-01 -0.103219 0.410599 0.144044 1.454274
# 2000-01-02 0.761038 0.121675 0.443863 0.333674
4.4.4. Last¶
# `last(offset)` keeps the final rows of a date-indexed frame that fall
# within the given period, counted back from the last index entry (2000-01-05).
# NOTE: `DataFrame.last` is deprecated since pandas 2.1 -- on recent versions
# filter with `.loc` instead; check the documentation for your pandas version.

# 'Y' (year): rows belonging to year 2000.
df.last('Y')
# Morning Noon Evening Midnight
# 2000-01-01 -0.103219 0.410599 0.144044 1.454274
# 2000-01-02 0.761038 0.121675 0.443863 0.333674
# 2000-01-03 1.494079 -0.205158 0.313068 -0.854096
# 2000-01-04 -2.552990 0.653619 0.864436 -0.742165
# 2000-01-05 2.269755 -1.454366 0.045759 -0.187184

# 'M' (month): rows belonging to January 2000.
df.last('M')
# Morning Noon Evening Midnight
# 2000-01-01 -0.103219 0.410599 0.144044 1.454274
# 2000-01-02 0.761038 0.121675 0.443863 0.333674
# 2000-01-03 1.494079 -0.205158 0.313068 -0.854096
# 2000-01-04 -2.552990 0.653619 0.864436 -0.742165
# 2000-01-05 2.269755 -1.454366 0.045759 -0.187184

# 'D' (day): only the last day.
df.last('D')
# Morning Noon Evening Midnight
# 2000-01-05 2.269755 -1.454366 0.045759 -0.187184

# 'W' (week): rows from 2000-01-03 (a Monday) onwards.
df.last('W')
# Morning Noon Evening Midnight
# 2000-01-03 1.494079 -0.205158 0.313068 -0.854096
# 2000-01-04 -2.552990 0.653619 0.864436 -0.742165
# 2000-01-05 2.269755 -1.454366 0.045759 -0.187184
4.4.5. Sample¶
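The `frac` parameter is the fraction of rows to return:
`frac=1/4` is 25% of the rows
`frac=.05` is 5% of the rows
`frac=0.5` is 50% of the rows
`frac=1.0` is 100% of the rows
Select `n` random rows, or a fraction `frac` of all rows, with repetition (`replace=True`) or without it (the default):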
# The rows shown below are picked at random; they are example outputs only.

# Without arguments a single random row is returned.
df.sample()
# Morning Noon Evening Midnight
# 2000-01-01 -0.103219 0.410599 0.144044 1.454274

# Two random rows (count passed positionally).
df.sample(2)
# Morning Noon Evening Midnight
# 2000-01-03 1.494079 -0.205158 0.313068 -0.854096
# 2000-01-04 -2.552990 0.653619 0.864436 -0.742165

# Two random rows drawn with replacement: the same row may appear more than once.
df.sample(n=2, replace=True)
# Morning Noon Evening Midnight
# 1999-12-31 1.867558 -0.977278 0.950088 -0.151357
# 1999-12-31 1.867558 -0.977278 0.950088 -0.151357

# A quarter of the 7 rows; the row count is a whole number (2 rows here).
df.sample(frac=1/4)
# Morning Noon Evening Midnight
# 2000-01-02 0.761038 0.121675 0.443863 0.333674
# 1999-12-31 1.867558 -0.977278 0.950088 -0.151357

# Half of the 7 rows (4 rows here).
df.sample(frac=0.5)
# Morning Noon Evening Midnight
# 2000-01-05 2.269755 -1.454366 0.045759 -0.187184
# 1999-12-30 1.764052 0.400157 0.978738 2.240893
# 2000-01-01 -0.103219 0.410599 0.144044 1.454274
# 1999-12-31 1.867558 -0.977278 0.950088 -0.151357
4.4.6. Reset Index¶
# Sampling 100% of the rows returns the whole frame in random order.
# `reset_index()` then moves the old date index into a regular column named
# "index" and numbers the rows from 0 in their new order.
df.sample(frac=1.0).reset_index()
# index Morning Noon Evening Midnight
# 0 2000-01-02 0.761038 0.121675 0.443863 0.333674
# 1 2000-01-03 1.494079 -0.205158 0.313068 -0.854096
# 2 2000-01-01 -0.103219 0.410599 0.144044 1.454274
# 3 1999-12-31 1.867558 -0.977278 0.950088 -0.151357
# 4 2000-01-05 2.269755 -1.454366 0.045759 -0.187184
# 5 2000-01-04 -2.552990 0.653619 0.864436 -0.742165
# 6 1999-12-30 1.764052 0.400157 0.978738 2.240893
import pandas as pd
# Six sample records of the Iris dataset: each dict becomes one row and
# its keys become the column names.
DATA = [
    {'sepal_length': 5.4, 'sepal_width': 3.9, 'petal_length': 1.3, 'petal_width': 0.4, 'species': 'setosa'},
    {'sepal_length': 5.9, 'sepal_width': 3.0, 'petal_length': 5.1, 'petal_width': 1.8, 'species': 'virginica'},
    {'sepal_length': 6.0, 'sepal_width': 3.4, 'petal_length': 4.5, 'petal_width': 1.6, 'species': 'versicolor'},
    {'sepal_length': 7.3, 'sepal_width': 2.9, 'petal_length': 6.3, 'petal_width': 1.8, 'species': 'virginica'},
    {'sepal_length': 5.6, 'sepal_width': 2.5, 'petal_length': 3.9, 'petal_width': 1.1, 'species': 'versicolor'},
    {'sepal_length': 5.4, 'sepal_width': 3.9, 'petal_length': 1.3, 'petal_width': 0.4, 'species': 'setosa'},
]

# DATA is an in-memory list of dicts, not a path or file-like object, so the
# frame is built with the constructor (`pd.read_csv` cannot read a list).
df = pd.DataFrame(DATA)

# Half of the six rows, i.e. three rows, in random order.
# A tiny fraction such as 0.02 would round to zero rows for a frame this small.
selected = df.sample(frac=0.5)
# Example content of `selected` (rows are random, yours will differ):
#    sepal_length  sepal_width  petal_length  petal_width     species
# 4           5.6          2.5           3.9          1.1  versicolor
# 1           5.9          3.0           5.1          1.8   virginica
# 3           7.3          2.9           6.3          1.8   virginica

# The old row labels are kept as a new column named "index".
selected.reset_index()
#    index  sepal_length  sepal_width  petal_length  petal_width     species
# 0      4           5.6          2.5           3.9          1.1  versicolor
# 1      1           5.9          3.0           5.1          1.8   virginica
# 2      3           7.3          2.9           6.3          1.8   virginica

# With drop=True the old row labels are discarded instead of being kept.
selected.reset_index(drop=True)
#    sepal_length  sepal_width  petal_length  petal_width     species
# 0           5.6          2.5           3.9          1.1  versicolor
# 1           5.9          3.0           5.1          1.8   virginica
# 2           7.3          2.9           6.3          1.8   virginica
4.4.7. Assignments¶
"""
* Assignment: DataFrame Sample
* Complexity: easy
* Lines of code: 4 lines
* Time: 8 min
English:
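1. Use data from the "Given" section (see below)
2. Read data from `DATA` as `df: pd.DataFrame`
3. Put all rows in random order
4. Reset the index without keeping a backup of the old one
5. Define `result` with the last 10% of rows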
Polish:
1. Użyj danych z sekcji "Given" (patrz poniżej)
2. Wczytaj dane z `DATA` jako `df: pd.DataFrame`
3. Ustaw wszystkie wiersze w losowej kolejności
4. Zresetuj index nie pozostawiając kopii zapasowej starego
5. Zdefiniuj `result` z ostatnimi 10% wierszy
Tests:
>>> type(result) is pd.DataFrame
True
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.max_columns', 10)
>>> pd.set_option('display.max_rows', 10)
>>> result # doctest: +NORMALIZE_WHITESPACE
Name Country Gender Flights Total Flights Total Flight Time (ddd:hh:mm)
0 Viktor Patsayev Soviet Union Man Soyuz 11 (1971) 1 023:21:21
1 Stephen G. Bowen United States Man STS-126 (2008), STS-132 (2010), STS-133 (2011) 3 040:10:04
2 Sergei Revin Russia Man Soyuz TMA-04M (2012) 1 124:23:51
3 Maksim Surayev Russia Man Soyuz TMA-16 (2009), Soyuz TMA-13M (2014) 2 334:12:09
4 Andrew Thomas United States Man STS-77 (1996), STS-89 (1998), STS-102 (2001), ... 4 177:09:14
.. ... ... ... ... ... ...
562 Lawrence J. DeLucas United States Man STS-50 (1992) 1 013:19:30
563 Aleksandr Laveykin Soviet Union Man Soyuz TM-2 (1987) 1 174:03:25
564 Owen Garriott United States Man Skylab 3 (1973), STS-9 (1983) 2 069:17:56
565 Ivan Vagner Russia Man Soyuz MS-16 (2020) 1 145:04:14
566 Yuri Malenchenko Russia Man Soyuz TM-19 (1994), STS-106 (2000), Soyuz TMA-... 6 826:09:22
<BLANKLINE>
[567 rows x 6 columns]
"""
# Given
import pandas as pd
import numpy as np
# Seed NumPy's global random generator so that random operations are
# repeatable between runs (presumably so the shuffle matches the doctest above).
np.random.seed(0)
# URL of the CSV file to load.
DATA = r'https://raw.githubusercontent.com/AstroMatt/book-python/master/_data/csv/astro-database.csv'
# Placeholder for the solution: replace the Ellipsis with your own code.
result = ...