To convert Pandas dataframe to tabular format using tabulate()
import pandas as pd
# tabulate() comes from the third-party "tabulate" package; without this
# import the call below fails with NameError.
from tabulate import tabulate

# Small demo frame: two integer columns, three rows.
df = pd.DataFrame(
    {
        "a": [1, 2, 3],
        "b": [4, 5, 6],
    }
)
# tablefmt="orgtbl" selects Org-mode table syntax; showindex=False leaves the
# pandas row index out of the rendered table.
df_str = tabulate(df, headers=df.columns, tablefmt="orgtbl", showindex=False)
print(df_str)
To convert Pandas dataframe to tabular format using pd2org()
# Build a small two-column, three-row pandas DataFrame to render as a table.
import pandas as pd
df = pd .DataFrame ({
"a" : [1 ,2 ,3 ],
"b" : [4 ,5 ,6 ]
})
# The next line is an Org Babel noweb reference, not Python: the pd2org block
# is called with the variable name "df" and its expansion is substituted here
# before the code runs. pd2org is defined outside this snippet -- presumably
# it prints `df` as an Org table; confirm against its definition.
<< pd2org ("df" )>>
To convert PySpark dataframe to tabular format using ps2org()
# Create a Spark session and a three-row DataFrame with two nullable integer
# columns, then hand it to the ps2org noweb block for rendering.
# NOTE: F and Window are imported but not used in this snippet.
import pyspark .sql .functions as F
import pyspark .sql .types as T
from pyspark .sql .window import Window
from pyspark .sql import SparkSession
spark = SparkSession .builder .master ("local" ).appName ("test-app" ).getOrCreate ()
# Explicit schema: columns "a" and "b", both IntegerType, nullable=True.
schema = T .StructType (
[
T .StructField ("a" , T .IntegerType (), True ),
T .StructField ("b" , T .IntegerType (), True ),
]
)
data = [(1 , 4 ), (2 , 5 ), (3 , 6 )]
df = spark .createDataFrame (schema = schema , data = data )
# The next line is an Org Babel noweb reference, not Python: the ps2org block
# is called with the variable name "df" and its expansion replaces the line
# before evaluation. ps2org is defined outside this snippet -- presumably it
# prints `df` as an Org table; confirm against its definition.
<< ps2org ("df" )>>
To convert PySpark dataframe to tabular format using actual and shown code
# Demo where the code displayed to the reader (`df.show()`) differs from the
# code that is actually evaluated after noweb expansion.
# NOTE: F and Window are imported but not used in this snippet.
import pyspark .sql .functions as F
import pyspark .sql .types as T
from pyspark .sql .window import Window
from pyspark .sql import SparkSession
# The session is configured with "spark.log.level" set to "OFF".
spark = SparkSession .builder .config ("spark.log.level" , "OFF" ).master ("local" ).appName ("test-app" ).getOrCreate ()
# Noweb reference to the nostderr block, called with the name "spark". Its
# definition is outside this snippet -- presumably it suppresses stderr
# output from the session; confirm against its definition.
<< nostderr ("spark" )>>
# Explicit schema: columns "a" and "b", both IntegerType, nullable=True.
schema = T .StructType (
[
T .StructField ("a" , T .IntegerType (), True ),
T .StructField ("b" , T .IntegerType (), True ),
]
)
data = [(1 , 4 ), (2 , 5 ), (3 , 6 )]
df = spark .createDataFrame (schema = schema , data = data )
print ("Dataframe df:" )
# Noweb reference to show2org followed by the displayed call. According to
# the expansion shown in this document, the reference becomes a print of the
# DataFrame as an Org table ending in "#", which turns the trailing
# `df.show()` into a comment.
<< show2org ("df" )>> df .show ()
The block above is converted into the following code block during evaluation:
# Expanded form of a show2org-style block: plain Python with no noweb
# references left.
# NOTE: tabulate is imported but never called directly here; pandas'
# to_markdown() relies on the tabulate package being installed.
from tabulate import tabulate
import pyspark .sql .functions as F
import pyspark .sql .types as T
from pyspark .sql .window import Window
from pyspark .sql import SparkSession
spark = SparkSession .builder .master ("local" ).appName ("test-app" ).getOrCreate ()
# Explicit schema: columns "a" and "b", both IntegerType, nullable=True.
schema = T .StructType (
[
T .StructField ("a" , T .IntegerType (), True ),
T .StructField ("b" , T .IntegerType (), True ),
]
)
data = [(1 , 4 ), (2 , 5 ), (3 , 6 )]
df = spark .createDataFrame (schema = schema , data = data )
# The Spark DataFrame is converted to pandas and printed in "orgtbl" format
# without the index; the originally displayed `df.show()` survives only as
# the trailing comment.
print (df .toPandas ().to_markdown (index = False , tablefmt = 'orgtbl' ))#df.show()
To convert PySpark dataframe to tabular format using returned value and NOWEB
# Built-in namespace
# Create a Spark session and a three-row DataFrame with two nullable integer
# columns; the table shown after this snippet is the value it returns.
# NOTE: F is imported but not used in this snippet.
import pyspark .sql .functions as F
import pyspark .sql .types as T
from pyspark .sql import SparkSession
spark = SparkSession .builder .master ("local" ).appName ("test-app" ).getOrCreate ()
# Explicit schema: columns "a" and "b", both IntegerType, nullable=True.
schema = T .StructType (
[
T .StructField ("a" , T .IntegerType (), True ),
T .StructField ("b" , T .IntegerType (), True ),
]
)
data = [(1 , 4 ), (2 , 5 ), (3 , 6 )]
df = spark .createDataFrame (schema = schema , data = data )
# The litps2org noweb reference is spliced into the middle of the expression,
# between `df` and `.show()`, and takes no arguments. Its expansion is defined
# outside this snippet -- presumably it turns the line into an expression
# yielding an Org table; confirm against its definition.
df << litps2org >> .show ()
| a | b |
|-----+-----|
| 1 | 4 |
| 2 | 5 |
| 3 | 6 |
To convert PySpark dataframe to tabular format using post-processing with AWK
# Rewrite the `df.show()` tables contained in $data as Org tables.
# Rule lines of the form +---+---+ become |---+---|; a rule that borders plain
# text (or the start/end of the input) is dropped, so only the rule under the
# header row survives. Non-table text passes through.
# $data is not set here; it must be provided by the caller.
# NOTE: gensub() is a GNU awk extension, so this requires gawk.
# printf replaces `echo " $data "`: the padding spaces kept the first and last
# line from matching the anchored patterns, and echo may process backslashes
# (e.g. in column names) on some shells.
# No comments are placed inside the awk program because every line ends in a
# backslash continuation.
printf '%s\n' "$data" | awk 'BEGIN{state_prev=""; prev_line=""}{ \
    if ($0 ~ /^\+[-+]+\+$/){ \
        state_curr = "hline" \
    } else { \
        if ($0 ~ /^\|.*\|$/) { \
            state_curr = "tblbody" \
        } \
        else { \
            state_curr = "txt" \
        } \
    } \
    \
    if ((state_curr == "hline") && (state_prev == "txt")) { \
        printf("%s", prev_line); \
        prev_line = ""; \
    } else if ((state_curr == "txt") && (state_prev == "hline")) { \
        prev_line = $0; \
    } else if ((state_curr == "hline") && (state_prev == "")) { \
        prev_line = ""; \
    } else if ((state_curr == "txt") && (state_prev == "")) { \
        printf("%s", prev_line); \
        prev_line = gensub(/^\+([-+]+)\+$/, "|\\1|", "g", $0); \
    } else { \
        if (NR > 2) { \
            printf("%s\n", prev_line); \
        } \
        prev_line = gensub(/^\+([-+]+)\+$/, "|\\1|", "g", $0); \
    } \
    state_prev = state_curr; \
}END{if (prev_line !~ /^\|.*\|$/) {print prev_line}}'
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession
from tabulate import tabulate

# Demo input for the post-processing examples: one row holding two string
# arrays, plus their differences in both directions.
spark = SparkSession.builder.master("local[1]").appName("test-app").getOrCreate()
schema = T.StructType(
    [
        T.StructField("A", T.ArrayType(T.StringType()), True),
        T.StructField("B", T.ArrayType(T.StringType()), True),
    ]
)
data = [(["b", "a", "c"], ["c", "d", "a", "f"])]
df = spark.createDataFrame(schema=schema, data=data)
# The derived column names contain a literal backslash. It is spelled "\\"
# because "\B" and "\A" are not valid escape sequences and trigger a warning
# on current Python versions; the resulting names are unchanged.
dft = df.select(
    "A",
    "B",
    F.array_except("A", "B").alias("A\\B"),
    F.array_except("B", "A").alias("B\\A"),
)
# The same table is shown twice, each preceded by a text line, so the output
# mixes plain text with more than one table.
print("Table 1:")
dft.show()
print("Table 2:")
dft.show()
print("Two tables are the same.")
A B A\B B\A
[b, a, c] [c, d, a, f] [b] [d, f]
Table 2:
A B A\B B\A
[b, a, c] [c, d, a, f] [b] [d, f]
Two tables are the same.
To convert PySpark dataframe to tabular format using post-processing with SED
# Rewrite every +---+---+ rule line in $data as |---+---|; all other lines
# pass through unchanged (top and bottom rules are kept, just converted).
# $data is not set here; it must be provided by the caller.
# printf replaces `echo " $data "`, whose padding spaces kept the first and
# last line from matching the anchored pattern; the sed script is in single
# quotes so the shell leaves "$" and "\1" alone.
printf '%s\n' "$data" | sed -E 's/^\+([-+]+)\+$/|\1|/g'
A B A\B B\A
[b, a, c] [c, d, a, f] [b] [d, f]
Table 2:
A B A\B B\A
[b, a, c] [c, d, a, f] [b] [d, f]
Two tables are the same.
To convert PySpark dataframe to tabular format using post-processing with Python
The formatting of a PySpark DataFrame is done in .showString().
import re

# Rewrite the `df.show()` tables contained in `data` as Org tables.
# `data` is not defined in this block; it must already hold the captured
# text -- presumably passed in as an Org Babel variable; confirm against the
# block header.
#
# Each line is classified as a rule ("hline", e.g. +---+---+), a table row
# ("tblbody", |...|) or plain text ("txt"). Output is delayed by one line:
# `prev_line` holds the pending line until the next line's class is known,
# which is what lets a rule next to plain text be dropped.

# One pattern serves both for classification and for the rewrite to Org's
# |---+---| form (group 1 is the part between the outer "+" signs).
hline_re = re.compile(r"^\+([-+]+)\+$")
body_re = re.compile(r"^\|.*\|$")

state_prev = ""
prev_line = ""
for j, line in enumerate(data.split("\n")):
    if hline_re.match(line):
        state_curr = "hline"
    elif body_re.match(line):
        state_curr = "tblbody"
    else:
        state_curr = "txt"

    if state_curr == "hline" and state_prev == "txt":
        # Rule right after text: emit the text (its newline is supplied when
        # the next line is flushed) and drop this rule.
        print(prev_line, end="")
        prev_line = ""
    elif state_curr == "txt" and state_prev == "hline":
        # Text right after a rule: the pending rule is discarded.
        prev_line = line
    elif state_curr == "txt" and state_prev == "":
        # Text on the very first line: nothing is pending yet.
        print(prev_line, end="")
        prev_line = hline_re.sub(r"|\1|", line)
    else:
        # Everything else: flush the pending line and queue this one,
        # converting it if it is a rule.
        if j > 0:
            print(prev_line, end="\n")
        prev_line = hline_re.sub(r"|\1|", line)
    state_prev = state_curr

# Flush the last pending line unless it is a converted rule.
if not re.match(r"^\|[-+]+\|$", prev_line):
    print(prev_line)
A B A\B B\A
[b, a, c] [c, d, a, f] [b] [d, f]
Table 2:
A B A\B B\A
[b, a, c] [c, d, a, f] [b] [d, f]
Two tables are the same.
To convert PySpark dataframe to HTML format using a built-in function
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Eager evaluation has to be switched on for the HTML rendering to work.
conf = SparkConf().set("spark.sql.repl.eagerEval.enabled", "true")
builder = SparkSession.builder.master("local[1]").appName("test-app")
spark = builder.config(conf=conf).getOrCreate()

# Two nullable integer columns, "a" and "b".
schema = T.StructType(
    [T.StructField(name, T.IntegerType(), True) for name in ("a", "b")]
)
data = [(1, 4), (2, 5), (3, 6)]
df = spark.createDataFrame(schema=schema, data=data)

# Render the DataFrame through its built-in HTML representation hook.
html = df._repr_html_()
print(html)