<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
  <channel>
    <title>river의 기술 로그</title>
    <link>https://seoyoyo.tistory.com/</link>
    <description>개발에 관심이 있는 데이터 분석가</description>
    <language>ko</language>
    <pubDate>Fri, 8 May 2026 11:38:48 +0900</pubDate>
    <generator>TISTORY</generator>
    <ttl>100</ttl>
    <managingEditor>riverruns</managingEditor>
    <image>
      <title>river의 기술 로그</title>
      <url>https://tistory1.daumcdn.net/tistory/4356380/attach/f190b47a009a4ebea32c66c86ba8bf53</url>
      <link>https://seoyoyo.tistory.com</link>
    </image>
    <item>
      <title>[Spark SQL] Date and Time Functions</title>
      <link>https://seoyoyo.tistory.com/11</link>
      <description>&lt;p data-ke-size=&quot;size16&quot;&gt;(Subset of) Standard Functions for Date and Time&lt;/p&gt;
&lt;table id=&quot;functions&quot; style=&quot;border-collapse: collapse; width: 100%;&quot; border=&quot;1&quot; data-ke-align=&quot;alignLeft&quot;&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;span&gt;&lt;a href=&quot;https://jaceklaskowski.gitbooks.io/mastering-spark-sql/content/spark-sql-functions-datetime.html#current_date&quot;&gt;current_date&lt;/a&gt;&lt;/span&gt;&lt;/td&gt;
&lt;td&gt;&lt;span&gt;Gives current date as a date column&lt;/span&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;span&gt;&lt;a href=&quot;https://jaceklaskowski.gitbooks.io/mastering-spark-sql/content/spark-sql-functions-datetime.html#current_timestamp&quot;&gt;current_timestamp&lt;/a&gt;&lt;/span&gt;&lt;/td&gt;
&lt;td&gt;&amp;nbsp;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;span&gt;&lt;a href=&quot;https://jaceklaskowski.gitbooks.io/mastering-spark-sql/content/spark-sql-functions-datetime.html#date_format&quot;&gt;date_format&lt;/a&gt;&lt;/span&gt;&lt;/td&gt;
&lt;td&gt;&amp;nbsp;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;span&gt;&lt;a href=&quot;https://jaceklaskowski.gitbooks.io/mastering-spark-sql/content/spark-sql-functions-datetime.html#to_date&quot;&gt;to_date&lt;/a&gt;&lt;/span&gt;&lt;/td&gt;
&lt;td&gt;&lt;span&gt;Converts column to date type (with an optional date format)&lt;/span&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;span&gt;&lt;a href=&quot;https://jaceklaskowski.gitbooks.io/mastering-spark-sql/content/spark-sql-functions-datetime.html#to_timestamp&quot;&gt;to_timestamp&lt;/a&gt;&lt;/span&gt;&lt;/td&gt;
&lt;td&gt;&lt;span&gt;Converts column to timestamp type (with an optional timestamp format)&lt;/span&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;span&gt;&lt;a href=&quot;https://jaceklaskowski.gitbooks.io/mastering-spark-sql/content/spark-sql-functions-datetime.html#unix_timestamp&quot;&gt;unix_timestamp&lt;/a&gt;&lt;/span&gt;&lt;/td&gt;
&lt;td&gt;&lt;span&gt;Converts current or specified time to Unix timestamp (in seconds)&lt;/span&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;span&gt;&lt;a href=&quot;https://jaceklaskowski.gitbooks.io/mastering-spark-sql/content/spark-sql-functions-datetime.html#window&quot;&gt;window&lt;/a&gt;&lt;/span&gt;&lt;/td&gt;
&lt;td&gt;&lt;span&gt;Generates time windows (i.e. tumbling, sliding and delayed windows)&lt;/span&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;current_date&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;function gives the current date as a&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;a href=&quot;https://jaceklaskowski.gitbooks.io/mastering-spark-sql/content/spark-sql-DataType.html#DateType&quot;&gt;date&lt;/a&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;column.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;Internally,&amp;nbsp;&lt;b&gt;date_format&lt;/b&gt;&amp;nbsp;creates a&amp;nbsp;&lt;a href=&quot;https://jaceklaskowski.gitbooks.io/mastering-spark-sql/content/spark-sql-Column.html&quot;&gt;Column&lt;/a&gt;&amp;nbsp;with a&amp;nbsp;DateFormatClass&amp;nbsp;binary expression.&amp;nbsp;DateFormatClass&amp;nbsp;takes the expression from the&amp;nbsp;dateExpr&amp;nbsp;column and the&amp;nbsp;format.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;current_timestamp&lt;/b&gt;&amp;nbsp;is also available as the&amp;nbsp;&lt;b&gt;now&lt;/b&gt;&amp;nbsp;function in SQL.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;h3 id=&quot;__a_id_unix_timestamp_a_converting_current_or_specified_time_to_unix_timestamp_code_unix_timestamp_code_function&quot; data-ke-size=&quot;size23&quot;&gt;Converting Current or Specified Time to Unix Timestamp&amp;thinsp;&amp;mdash;&amp;thinsp;unix_timestamp&lt;span&gt;&amp;nbsp;&lt;/span&gt;Function&lt;/h3&gt;
&lt;ul style=&quot;list-style-type: disc;&quot; data-ke-list-type=&quot;disc&quot;&gt;
&lt;li&gt;Gives current timestamp (in seconds)&lt;/li&gt;
&lt;li&gt;Converts&lt;span&gt;&amp;nbsp;&lt;/span&gt;time&lt;span&gt;&amp;nbsp;&lt;/span&gt;string in format&lt;span&gt;&amp;nbsp;&lt;/span&gt;yyyy-MM-dd HH:mm:ss&lt;span&gt;&amp;nbsp;&lt;/span&gt;to Unix timestamp (in seconds)&lt;/li&gt;
&lt;li&gt;unix_timestamp&lt;span&gt;&amp;nbsp;&lt;/span&gt;converts the current or specified&lt;span&gt;&amp;nbsp;&lt;/span&gt;time&lt;span&gt;&amp;nbsp;&lt;/span&gt;in the specified&lt;span&gt;&amp;nbsp;&lt;/span&gt;format&lt;span&gt;&amp;nbsp;&lt;/span&gt;to a Unix timestamp (in seconds).&lt;/li&gt;
&lt;li&gt;unix_timestamp&lt;span&gt;&amp;nbsp;&lt;/span&gt;supports a column of type&lt;span&gt;&amp;nbsp;&lt;/span&gt;Date,&lt;span&gt;&amp;nbsp;&lt;/span&gt;Timestamp&lt;span&gt;&amp;nbsp;&lt;/span&gt;or&lt;span&gt;&amp;nbsp;&lt;/span&gt;String.&lt;/li&gt;
&lt;li&gt;unix_timestamp&lt;span&gt;&amp;nbsp;&lt;/span&gt;returns&lt;span&gt;&amp;nbsp;&lt;/span&gt;null&lt;span&gt;&amp;nbsp;&lt;/span&gt;if conversion fails.&lt;/li&gt;
&lt;/ul&gt;
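&lt;p data-ke-size=&quot;size16&quot;&gt;A minimal PySpark sketch of the functions above (assuming a SparkSession named spark is already available; the literal dates are just examples):&lt;/p&gt;
&lt;pre class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;from pyspark.sql import functions as F

# A single-row DataFrame to evaluate the functions against
df = spark.range(1)

df.select(
    F.current_date().alias(&quot;today&quot;),     # current date as a date column
    F.current_timestamp().alias(&quot;now&quot;),  # current timestamp
    F.date_format(F.current_timestamp(), &quot;yyyy-MM-dd&quot;).alias(&quot;formatted&quot;),
    F.to_date(F.lit(&quot;2021-09-25&quot;), &quot;yyyy-MM-dd&quot;).alias(&quot;as_date&quot;),
    F.unix_timestamp(F.lit(&quot;2021-09-25 06:21:10&quot;)).alias(&quot;epoch_seconds&quot;),
).show(truncate=False)&lt;/code&gt;&lt;/pre&gt;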
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;</description>
      <category>Spark(Scala and PySpark)</category>
      <author>riverruns</author>
      <guid isPermaLink="true">https://seoyoyo.tistory.com/11</guid>
      <comments>https://seoyoyo.tistory.com/11#entry11comment</comments>
      <pubDate>Sat, 25 Sep 2021 06:21:10 +0900</pubDate>
    </item>
    <item>
      <title>[SQL] collect_list() 함수</title>
      <link>https://seoyoyo.tistory.com/10</link>
      <description>&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-origin-width=&quot;300&quot; data-origin-height=&quot;300&quot; data-filename=&quot;sql_jpg.jpg&quot; data-ke-mobilestyle=&quot;widthOrigin&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/mPIGy/btq434cB1dx/lQkayLK2mcGN0XXeM2dBKk/img.jpg&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/mPIGy/btq434cB1dx/lQkayLK2mcGN0XXeM2dBKk/img.jpg&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/mPIGy/btq434cB1dx/lQkayLK2mcGN0XXeM2dBKk/img.jpg&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FmPIGy%2Fbtq434cB1dx%2FlQkayLK2mcGN0XXeM2dBKk%2Fimg.jpg&quot; data-origin-width=&quot;300&quot; data-origin-height=&quot;300&quot; data-filename=&quot;sql_jpg.jpg&quot; data-ke-mobilestyle=&quot;widthOrigin&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;collect_list()&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;: a function that, when a single id has several values, bundles those values into an array (*&lt;span style=&quot;color: #006dd7;&quot;&gt;&lt;b&gt;also works in Spark SQL&lt;/b&gt;&lt;/span&gt;)&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1621147468496&quot; class=&quot;sql&quot; style=&quot;margin: 20px auto 0px; display: block; overflow: auto; padding: 20px; color: #383a42; background: #f8f8f8; font-size: 14px; font-family: 'SF Mono', Menlo, Consolas, Monaco, monospace; border: 1px solid #ebebeb; line-height: 1.71; cursor: default; z-index: 1;&quot; data-ke-language=&quot;sql&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;SELECT id, COLLECT_LIST(ITEM) AS item_list 
FROM item_info 
GROUP BY id&lt;/code&gt;&lt;/pre&gt;
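&lt;p data-ke-size=&quot;size16&quot;&gt;The same aggregation with the DataFrame API, as a sketch (assuming a SparkSession named spark and the item_info table from the query above):&lt;/p&gt;
&lt;pre class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;from pyspark.sql import functions as F

# Group by id and gather each id's ITEM values into an array
df = spark.table(&quot;item_info&quot;)
item_lists = df.groupBy(&quot;id&quot;).agg(F.collect_list(&quot;ITEM&quot;).alias(&quot;item_list&quot;))
item_lists.show(truncate=False)&lt;/code&gt;&lt;/pre&gt;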
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;</description>
      <category>SQL</category>
      <author>riverruns</author>
      <guid isPermaLink="true">https://seoyoyo.tistory.com/10</guid>
      <comments>https://seoyoyo.tistory.com/10#entry10comment</comments>
      <pubDate>Sun, 16 May 2021 15:45:07 +0900</pubDate>
    </item>
    <item>
      <title>[PySpark] DataFrame 의 show()</title>
      <link>https://seoyoyo.tistory.com/8</link>
      <description>&lt;p data-ke-size=&quot;size16&quot;&gt;pyspark의 DataFrame이라고 해서 pandas DataFrame이랑 같은 것이 아니었다.&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;h2 data-ke-size=&quot;size26&quot;&gt;show()&lt;/h2&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;It looks familiar because it shows up a lot when visualizing with Python's matplotlib library. As the name says, it asks to show something.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;Since this is not plt.show(), it is of course the DataFrame that gets shown.&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-origin-width=&quot;2572&quot; data-origin-height=&quot;880&quot; data-filename=&quot;Screen Shot 2021-05-15 at 9.05.23 PM.png&quot; data-ke-mobilestyle=&quot;widthOrigin&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/nbU1C/btq4ZfZIZte/n36FKfd80Qx9Oi46qJsqp0/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/nbU1C/btq4ZfZIZte/n36FKfd80Qx9Oi46qJsqp0/img.png&quot; data-alt=&quot;The familiar tidy output&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/nbU1C/btq4ZfZIZte/n36FKfd80Qx9Oi46qJsqp0/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FnbU1C%2Fbtq4ZfZIZte%2Fn36FKfd80Qx9Oi46qJsqp0%2Fimg.png&quot; data-origin-width=&quot;2572&quot; data-origin-height=&quot;880&quot; data-filename=&quot;Screen Shot 2021-05-15 at 9.05.23 PM.png&quot; data-ke-mobilestyle=&quot;widthOrigin&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot;/&gt;&lt;/span&gt;&lt;figcaption&gt;The familiar tidy output&amp;nbsp;&lt;/figcaption&gt;
&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;In fact, show() can take two arguments.&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1621147812329&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;df.show(3, False)&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;e.g.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;writing show(3, False) tells Spark to&lt;/p&gt;
&lt;ul style=&quot;list-style-type: disc;&quot; data-ke-list-type=&quot;disc&quot;&gt;
&lt;li&gt;show 3 rows of the dataset, and&lt;/li&gt;
&lt;li&gt;print every string in full instead of truncating&lt;/li&gt;
&lt;/ul&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;That is what the two arguments mean.&amp;nbsp;&lt;/p&gt;
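&lt;p data-ke-size=&quot;size16&quot;&gt;A short sketch of the two arguments in action (assuming a SparkSession named spark; the toy data is made up):&lt;/p&gt;
&lt;pre class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# A toy DataFrame with one long string column
df = spark.createDataFrame(
    [(i, &quot;x&quot; * 40) for i in range(5)], [&quot;id&quot;, &quot;long_text&quot;]
)

df.show(3)         # 3 rows, long strings truncated to 20 characters
df.show(3, False)  # 3 rows, full strings (truncate=False also works)&lt;/code&gt;&lt;/pre&gt;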
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-origin-width=&quot;2788&quot; data-origin-height=&quot;882&quot; data-filename=&quot;Screen Shot 2021-05-15 at 9.05.40 PM.png&quot; data-ke-mobilestyle=&quot;widthOrigin&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/b02UO2/btq4ZgxzysG/VkkfhzUAjh80om9tUGMBAK/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/b02UO2/btq4ZgxzysG/VkkfhzUAjh80om9tUGMBAK/img.png&quot; data-alt=&quot;Messy, but it shows everything&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/b02UO2/btq4ZgxzysG/VkkfhzUAjh80om9tUGMBAK/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2Fb02UO2%2Fbtq4ZgxzysG%2FVkkfhzUAjh80om9tUGMBAK%2Fimg.png&quot; data-origin-width=&quot;2788&quot; data-origin-height=&quot;882&quot; data-filename=&quot;Screen Shot 2021-05-15 at 9.05.40 PM.png&quot; data-ke-mobilestyle=&quot;widthOrigin&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot;/&gt;&lt;/span&gt;&lt;figcaption&gt;Messy, but it shows everything&lt;/figcaption&gt;
&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;Use whichever suits your purpose.&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;</description>
      <category>Spark(Scala and PySpark)</category>
      <author>riverruns</author>
      <guid isPermaLink="true">https://seoyoyo.tistory.com/8</guid>
      <comments>https://seoyoyo.tistory.com/8#entry8comment</comments>
      <pubDate>Sat, 15 May 2021 21:08:43 +0900</pubDate>
    </item>
    <item>
      <title>[PySpark] csv, parquet 파일 읽어오기</title>
      <link>https://seoyoyo.tistory.com/7</link>
      <description>&lt;h2 data-ke-size=&quot;size26&quot;&gt;csv 파일 읽기&amp;nbsp;&lt;/h2&gt;
&lt;pre id=&quot;code_1621074468736&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# csv 파일 읽기
df = spark.read.csv(&quot;...&quot;)
df.printSchema()  # 컬럼의 데이터타입 확인
df.show()

# parquet으로 저장된 파일 읽기
df_2 = spark.read.parquet(&quot;...&quot;)
df_2.printSchema()  # 컬럼의 데이터타입 확인
df_2.show()&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;There is hardly any difference between csv and parquet.&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;Just write parquet where you would write csv. Simple!&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;.....except it is not&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;If you load it with just that, you get the following result.&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignLeft&quot; data-origin-width=&quot;1124&quot; data-origin-height=&quot;848&quot; data-filename=&quot;Screen Shot 2021-05-15 at 8.04.16 PM.png&quot; width=&quot;550&quot; height=&quot;415&quot; data-ke-mobilestyle=&quot;widthOrigin&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/bfwyao/btq42z4o3YI/AIu8d0HK7TndnDdcGNFAsK/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/bfwyao/btq42z4o3YI/AIu8d0HK7TndnDdcGNFAsK/img.png&quot; data-alt=&quot;Looks like everything is read as false by default&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/bfwyao/btq42z4o3YI/AIu8d0HK7TndnDdcGNFAsK/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2Fbfwyao%2Fbtq42z4o3YI%2FAIu8d0HK7TndnDdcGNFAsK%2Fimg.png&quot; data-origin-width=&quot;1124&quot; data-origin-height=&quot;848&quot; data-filename=&quot;Screen Shot 2021-05-15 at 8.04.16 PM.png&quot; width=&quot;550&quot; height=&quot;415&quot; data-ke-mobilestyle=&quot;widthOrigin&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot;/&gt;&lt;/span&gt;&lt;figcaption&gt;Looks like everything is read as false by default&lt;/figcaption&gt;
&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;You can see that the column names meant to be the header have all dropped down into a row (a record).&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;You can also see that the Count column, which should be an integer, was handled as a String.&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;To prevent this, you have to pass options when loading.&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;Scala ignored the line breaks, but PySpark needs a marker where a statement continues on the next line; otherwise it throws an error.&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;Putting a&amp;nbsp; \&amp;nbsp; there does the trick.&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1621079057236&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;df = spark.read.option(&quot;header&quot;, True) \
               .option(&quot;inferSchema&quot;, True) \
               .csv(&quot;...&quot;)
               
df.printSchema()
df.show()&lt;/code&gt;&lt;/pre&gt;
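&lt;p data-ke-size=&quot;size16&quot;&gt;The same options can also be passed as keyword arguments, which avoids the line-continuation backslashes entirely (a sketch; csv() accepts header and inferSchema directly):&lt;/p&gt;
&lt;pre class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# Same read, with the options as keyword arguments
df = spark.read.csv(&quot;...&quot;, header=True, inferSchema=True)

df.printSchema()
df.show()&lt;/code&gt;&lt;/pre&gt;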
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;Then you get a clean result like the following.&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignLeft&quot; data-origin-width=&quot;1112&quot; data-origin-height=&quot;906&quot; data-filename=&quot;Screen Shot 2021-05-15 at 8.38.43 PM.png&quot; width=&quot;550&quot; data-ke-mobilestyle=&quot;widthOrigin&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/EwwtN/btq42yYKTO0/BgfIOI6c6j4SsiPK10hIb1/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/EwwtN/btq42yYKTO0/BgfIOI6c6j4SsiPK10hIb1/img.png&quot; data-alt=&quot;Header correct, column data types correct&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/EwwtN/btq42yYKTO0/BgfIOI6c6j4SsiPK10hIb1/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FEwwtN%2Fbtq42yYKTO0%2FBgfIOI6c6j4SsiPK10hIb1%2Fimg.png&quot; data-origin-width=&quot;1112&quot; data-origin-height=&quot;906&quot; data-filename=&quot;Screen Shot 2021-05-15 at 8.38.43 PM.png&quot; width=&quot;550&quot; data-ke-mobilestyle=&quot;widthOrigin&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot;/&gt;&lt;/span&gt;&lt;figcaption&gt;Header correct, column data types correct&lt;/figcaption&gt;
&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;Good&amp;nbsp;&lt;/p&gt;</description>
      <category>Spark(Scala and PySpark)</category>
      <author>riverruns</author>
      <guid isPermaLink="true">https://seoyoyo.tistory.com/7</guid>
      <comments>https://seoyoyo.tistory.com/7#entry7comment</comments>
      <pubDate>Sat, 15 May 2021 19:28:38 +0900</pubDate>
    </item>
    <item>
      <title>[PySpark] 빈 데이터프레임 empty DataFrame 생성</title>
      <link>https://seoyoyo.tistory.com/6</link>
      <description>&lt;h2 data-ke-size=&quot;size26&quot;&gt;&lt;b&gt;빈 데이터프레임 생성하기 (Create empty Dataframe in PySpark)&lt;/b&gt;&lt;/h2&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1621043996446&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;schema = StructType([
    StructField(&quot;v1&quot;, LongType(), True),
    StructField(&quot;v2&quot;, StringType(), False),
    StructField(&quot;v3&quot;, StringType(), False),
])
df = sqlContext.createDataFrame([], schema)

# The third argument to StructField() is whether the field is nullable&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;One thing you must not forget here: StructType, LongType, and the like have to be imported.&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;Otherwise you get an error.&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;So be sure to do the import below, then create the empty DataFrame.&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1621053898707&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;from pyspark.sql.types import StructType, StructField, LongType, StringType
# I imported only the ones I use&lt;/code&gt;&lt;/pre&gt;
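&lt;p data-ke-size=&quot;size16&quot;&gt;Putting the two pieces together (a minimal sketch; with a modern SparkSession named spark, spark.createDataFrame works the same way as sqlContext.createDataFrame):&lt;/p&gt;
&lt;pre class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;from pyspark.sql.types import StructType, StructField, LongType, StringType

schema = StructType([
    StructField(&quot;v1&quot;, LongType(), True),
    StructField(&quot;v2&quot;, StringType(), False),
    StructField(&quot;v3&quot;, StringType(), False),
])

# An empty list of rows plus a schema gives an empty DataFrame
df = spark.createDataFrame([], schema)
df.printSchema()  # the columns and types are there
df.show()         # ...but no rows&lt;/code&gt;&lt;/pre&gt;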
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;</description>
      <category>Spark(Scala and PySpark)</category>
      <author>riverruns</author>
      <guid isPermaLink="true">https://seoyoyo.tistory.com/6</guid>
      <comments>https://seoyoyo.tistory.com/6#entry6comment</comments>
      <pubDate>Sat, 15 May 2021 13:46:57 +0900</pubDate>
    </item>
    <item>
      <title>Getting to know PySpark</title>
      <link>https://seoyoyo.tistory.com/4</link>
      <description>&lt;p&gt;Learn how Spark manages data and how can you read and write tables from Python.&amp;nbsp;&lt;/p&gt;
&lt;h1&gt;What is Spark, anyway?&lt;/h1&gt;
&lt;p&gt;Spark is a platform for cluster computing. Spark lets you spread data and computations over&lt;span&gt;&amp;nbsp;&lt;/span&gt;clusters&lt;span&gt;&amp;nbsp;&lt;/span&gt;with multiple&lt;span&gt;&amp;nbsp;&lt;/span&gt;nodes&lt;span&gt;&amp;nbsp;&lt;/span&gt;(think of each node as a separate computer). Splitting up your data makes it easier to work with very large datasets because each node only works with a small amount of data.&lt;/p&gt;
&lt;p&gt;As each node works on its own subset of the total data, it also carries out a part of the total calculations required, so that both data processing and computation are performed&lt;span&gt;&amp;nbsp;&lt;/span&gt;in parallel&lt;span&gt;&amp;nbsp;&lt;/span&gt;over the nodes in the cluster. Parallel computation can make certain types of programming tasks much faster.&lt;/p&gt;
&lt;p&gt;However, with greater computing power comes greater complexity.&lt;/p&gt;
&lt;p&gt;Deciding whether or not Spark is the best solution for your problem takes some experience, but you can consider questions like:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Is my data too big to work with on a single machine?&lt;/li&gt;
&lt;li&gt;Can my calculations be easily parallelized?&lt;/li&gt;
&lt;/ul&gt;
&lt;h1&gt;Using Spark in Python&lt;/h1&gt;
&lt;p&gt;The first step in using Spark is connecting to a cluster.&lt;/p&gt;
&lt;p&gt;In practice, the cluster will be hosted on a remote machine that's connected to all other nodes. There will be one computer, called the master, that manages splitting up the data and the computations. The master is connected to the rest of the computers in the cluster, which are called workers. The master sends the workers data and calculations to run, and they send their results back to the master.&lt;/p&gt;
&lt;p&gt;When you're just getting started with Spark it's simpler to just run a cluster locally. Thus, for this course, instead of connecting to another computer, all computations will be run on DataCamp's servers in a simulated cluster.&lt;/p&gt;
&lt;p&gt;Creating the connection is as simple as creating an instance of the&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;SparkContext&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;class. The class constructor takes a few optional arguments that allow you to specify the attributes of the cluster you're connecting to.&lt;/p&gt;
&lt;p&gt;An object holding all these attributes can be created with the&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;SparkConf()&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;constructor. Take a look at the&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;a href=&quot;http://spark.apache.org/docs/2.1.0/api/python/pyspark.html&quot;&gt;documentation&lt;/a&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;for all the details!&lt;/p&gt;
&lt;p&gt;For the rest of this course you'll have a&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;SparkContext&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;called&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;sc&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;already available in your workspace.&lt;/p&gt;
&lt;hr style=&quot;box-sizing: content-box; height: 1px; background-color: #f7f7fc; border: 0px; margin: 32px 0px;&quot; /&gt;
&lt;p&gt;Q. How do you connect to a Spark cluster from PySpark?&lt;/p&gt;
&lt;p&gt;- Create an instance of the&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;SparkContext&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;class.&lt;/p&gt;
&lt;h1&gt;Examining The SparkContext&lt;/h1&gt;
&lt;p&gt;In this exercise you'll get familiar with the&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;SparkContext&lt;/b&gt;.&lt;/p&gt;
&lt;p&gt;You'll probably notice that code takes longer to run than you might expect. This is because Spark is some serious software. It takes more time to start up than you might be used to. You may also find that running simpler computations might take longer than expected. That's because all the optimizations that Spark has under its hood are designed for complicated operations with big data sets. That means that for simple or small problems Spark may actually perform worse than some other solutions!&lt;/p&gt;
&lt;p&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;Get to know the&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;SparkContext&lt;/b&gt;.&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Call&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;print()&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;on&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;sc&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;to verify there's a&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;SparkContext&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;in your environment.&lt;/li&gt;
&lt;li&gt;Call&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;print()&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;on&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;sc.version&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;to see what version of Spark is running on your cluster.&lt;/li&gt;
&lt;/ul&gt;
&lt;pre id=&quot;code_1620548767773&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# Verify SparkContext
print(sc)

# Print Spark version
print(sc.version)&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&amp;nbsp;&lt;/p&gt;
&lt;h1&gt;Using DataFrames&lt;/h1&gt;
&lt;p&gt;Spark's core data structure is the Resilient Distributed Dataset (RDD). This is a low level object that lets Spark work its magic by splitting data across multiple nodes in the cluster. However, RDDs are hard to work with directly, so in this course you'll be using the Spark DataFrame abstraction built on top of RDDs.&lt;/p&gt;
&lt;p&gt;The Spark DataFrame was designed to behave a lot like a SQL table (a table with variables in the columns and observations in the rows). Not only are they easier to understand, DataFrames are also more optimized for complicated operations than RDDs.&lt;/p&gt;
&lt;p&gt;When you start modifying and combining columns and rows of data, there are many ways to arrive at the same result, but some often take much longer than others. When using RDDs, it's up to the data scientist to figure out the right way to optimize the query, but the DataFrame implementation has much of this optimization built in!&lt;/p&gt;
&lt;p&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;span style=&quot;color: #333333;&quot;&gt;To start working with Spark DataFrames, you first have to create a&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;&lt;b&gt;SparkSession&lt;/b&gt;&lt;span style=&quot;color: #333333;&quot;&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;&lt;span style=&quot;color: #333333;&quot;&gt;object from your&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;&lt;b&gt;SparkContext&lt;/b&gt;&lt;span style=&quot;color: #333333;&quot;&gt;. You can think of the&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;&lt;b&gt;SparkContext&lt;/b&gt;&lt;span style=&quot;color: #333333;&quot;&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;as your connection to the cluster and the&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;&lt;b&gt;SparkSession&lt;/b&gt;&lt;span style=&quot;color: #333333;&quot;&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;as your interface with that connection.&lt;/span&gt;&lt;/p&gt;
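&lt;p&gt;A minimal sketch of that relationship (assuming a &lt;b&gt;SparkContext&lt;/b&gt; named &lt;b&gt;sc&lt;/b&gt; already exists; the &lt;b&gt;SparkSession&lt;/b&gt; constructor can wrap it directly):&lt;/p&gt;
&lt;pre class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;from pyspark.sql import SparkSession

# Wrap the existing cluster connection (sc) in a SQL-aware interface
spark = SparkSession(sc)

print(spark.sparkContext is sc)  # True: the same underlying connection&lt;/code&gt;&lt;/pre&gt;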
&lt;p&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;span style=&quot;color: #333333;&quot;&gt;&lt;span&gt;Remember, for the rest of this course you'll have a&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;&lt;b&gt;SparkSession&lt;/b&gt;&lt;span&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;called&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;&lt;b&gt;spark&lt;/b&gt;&lt;span&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;available in your workspace!&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;p&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;span style=&quot;color: #333333;&quot;&gt;&lt;span&gt;Q. Which of the following is an advantage of Spark DataFrames over RDDs?&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;p&gt;&lt;span style=&quot;color: #333333;&quot;&gt;&lt;span&gt;- &lt;span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;Operations using DataFrames are automatically optimized.&lt;/p&gt;
&lt;h1&gt;Creating a SparkSession&lt;/h1&gt;
&lt;p&gt;&lt;span&gt;We've already created a&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;&lt;b&gt;SparkSession&lt;/b&gt;&lt;span&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;for you called&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;spark&lt;span&gt;, but what if you're not sure there already is one? Creating multiple&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;SparkSession&lt;span&gt;s and&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;SparkContext&lt;span&gt;s can cause issues, so it's best practice to use the&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;SparkSession.builder.getOrCreate()&lt;span&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;method. This returns an existing&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;SparkSession&lt;span&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;if there's already one in the environment, or creates a new one if necessary!&lt;/span&gt;&lt;/p&gt;
&lt;p&gt;&amp;nbsp;&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Import&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;SparkSession&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;from&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;pyspark.sql&lt;/b&gt;.&lt;/li&gt;
&lt;li&gt;Make a new&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;SparkSession&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;called&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;my_spark&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;using&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;SparkSession.builder.getOrCreate()&lt;/b&gt;.&lt;/li&gt;
&lt;li&gt;Print&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;my_spark&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;to the console to verify it's a&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;SparkSession.&lt;/b&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;pre id=&quot;code_1620545427510&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# Import SparkSession from pyspark.sql
from pyspark.sql import SparkSession

# Create my_spark
my_spark = SparkSession.builder.getOrCreate()

# Print my_spark
print(my_spark)&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&amp;nbsp;&lt;/p&gt;
&lt;h1&gt;Viewing tables&lt;/h1&gt;
&lt;p&gt;Once you've created a&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;SparkSession&lt;/b&gt;, you can start poking around to see what data is in your cluster!&lt;/p&gt;
&lt;p&gt;Your&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;SparkSession&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;has an attribute called&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;catalog&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;which lists all the data inside the cluster. This attribute has a few methods for extracting different pieces of information.&lt;/p&gt;
&lt;p&gt;One of the most useful is the&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;.listTables()&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;method, which returns the names of all the tables in your cluster as a list.&lt;/p&gt;
&lt;p&gt;&amp;nbsp;&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;See what tables are in your cluster by calling&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;spark.catalog.listTables()&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;and printing the result!&lt;/li&gt;
&lt;/ul&gt;
&lt;pre id=&quot;code_1620545597713&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# Print the tables in the catalog
print(spark.catalog.listTables())&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;--&amp;gt; Result:&lt;/p&gt;
&lt;p&gt;[Table(name='flights', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]&lt;/p&gt;
&lt;p&gt;&amp;nbsp;&lt;/p&gt;
&lt;h1&gt;Are you query-ious?&lt;/h1&gt;
&lt;p&gt;One of the advantages of the DataFrame interface is that you can run SQL queries on the tables in your Spark cluster. If you don't have any experience with SQL, don't worry, we'll provide you with queries! (To learn more SQL, start with our&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;a href=&quot;https://www.datacamp.com/courses/intro-to-sql-for-data-science&quot;&gt;Introduction to SQL&lt;/a&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;course.)&lt;/p&gt;
&lt;p&gt;As you saw in the last exercise, one of the tables in your cluster is the&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;flights&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;/b&gt;table. This table contains a row for every flight that left Portland International Airport (PDX) or Seattle-Tacoma International Airport (SEA) in 2014 and 2015.&lt;/p&gt;
&lt;p&gt;Running a query on this table is as easy as using the&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;.sql()&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;method on your&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;SparkSession&lt;/b&gt;. This method takes a string containing the query and returns a DataFrame with the results!&lt;/p&gt;
&lt;p&gt;If you look closely, you'll notice that the table&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;flights&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;/b&gt;is only mentioned in the query, not as an argument to any of the methods. This is because there isn't a local object in your environment that holds that data, so it wouldn't make sense to pass the table as an argument.&lt;/p&gt;
&lt;p&gt;Remember, we've already created a&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;SparkSession&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;called&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;spark&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;in your workspace. (It's no longer called&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;my_spark&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;because we created it for you!)&lt;/p&gt;
&lt;p&gt;&amp;nbsp;&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Use the&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;.sql()&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;method to get the first 10 rows of the&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;flights&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;table and save the result to&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;flights10.&lt;/b&gt; The variable&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;query&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;contains the appropriate SQL query.&lt;/li&gt;
&lt;li&gt;Use the DataFrame method&lt;b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;.show()&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;to print&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;flights10&lt;/b&gt;.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1620545965562&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# Wrong Answer

# Don't change this query
query = &quot;FROM flights SELECT * LIMIT 10&quot;

# Get the first 10 rows of flights
flights10 = spark.sql(&quot;SELECT * FROM flights LIMIT 10&quot;)

# Show the results
flights10.show()&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;span&gt;Remember you can run a query by doing&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;spark.sql(a_query)&lt;span&gt;!&lt;/span&gt;&lt;/p&gt;
&lt;p&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1620548654118&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# Solution

# Don't change this query
query = &quot;FROM flights SELECT * LIMIT 10&quot;

# Get the first 10 rows of flights
flights10 = spark.sql(query)

# Show the results
flights10.show()&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&amp;nbsp;&lt;/p&gt;
&lt;h1&gt;Pandafy a Spark DataFrame&lt;/h1&gt;
&lt;p&gt;Suppose you've run a query on your huge dataset and aggregated it down to something a little more manageable.&lt;/p&gt;
&lt;p&gt;Sometimes it makes sense to then take that table and work with it locally using a tool like&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;pandas&lt;/b&gt;. Spark DataFrames make that easy with the&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;.toPandas()&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;method. Calling this method on a Spark DataFrame returns the corresponding&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;pandas&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;DataFrame. It's as simple as that!&lt;/p&gt;
&lt;p&gt;This time the query counts the number of flights to each airport from SEA and PDX.&lt;/p&gt;
&lt;p&gt;Remember, there's already a&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;SparkSession&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;called&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;spark&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;in your workspace!&lt;/p&gt;
&lt;p&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&amp;nbsp;&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Run the query using the&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;.sql()&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;method. Save the result in&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;flight_counts&lt;/b&gt;.&lt;/li&gt;
&lt;li&gt;Use the&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;.toPandas()&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;method on&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;flight_counts&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;to create a&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;pandas&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;DataFrame called&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;pd_counts&lt;/b&gt;.&lt;/li&gt;
&lt;li&gt;Print the&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;.head()&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;of&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;pd_counts&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;to the console.&lt;/li&gt;
&lt;/ul&gt;
&lt;pre id=&quot;code_1620549056390&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# Don't change this query
query = &quot;SELECT origin, dest, COUNT(*) as N FROM flights GROUP BY origin, dest&quot;

# Run the query
flight_counts = spark.sql(query)

# Convert the results to a pandas DataFrame
pd_counts = flight_counts.toPandas()

# Print the head of pd_counts
print(pd_counts.head())&lt;/code&gt;&lt;/pre&gt;
&lt;h1&gt;Put some Spark in your data&lt;/h1&gt;
&lt;p&gt;In the last exercise, you saw how to move data from Spark to&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;pandas&lt;/b&gt;. However, maybe you want to go the other direction, and put a&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;pandas&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;DataFrame into a Spark cluster! The&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;SparkSession&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;class has a method for this as well.&lt;/p&gt;
&lt;p&gt;The&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;.createDataFrame()&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;method takes a&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;pandas&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;DataFrame and returns a Spark DataFrame.&lt;/p&gt;
&lt;p&gt;The output of this method is stored locally, not in the&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;SparkSession&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;catalog. This means that you can use all the Spark DataFrame methods on it, but you can't access the data in other contexts.&lt;/p&gt;
&lt;p&gt;For example, a SQL query (using the&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;.sql()&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;method) that references your DataFrame will throw an error. To access the data in this way, you have to save it as a&lt;span&gt;&amp;nbsp;&lt;/span&gt;temporary table.&lt;/p&gt;
&lt;p&gt;You can do this using the&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;.createTempView()&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;Spark DataFrame method, which takes as its only argument the name of the temporary table you'd like to register. This method registers the DataFrame as a table in the catalog, but as this table is temporary, it can only be accessed from the specific&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;SparkSession&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;used to create the Spark DataFrame.&lt;/p&gt;
&lt;p&gt;There is also the method&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;.createOrReplaceTempView()&lt;/b&gt;. This safely creates a new temporary table if nothing was there before, or updates an existing table if one was already defined. You'll use this method to avoid running into problems with duplicate tables.&lt;/p&gt;
&lt;p&gt;Check out the diagram to see all the different ways your Spark data structures interact with each other.&lt;/p&gt;
&lt;p&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-filename=&quot;스크린샷 2021-05-09 오후 4.53.41.png&quot; data-origin-width=&quot;972&quot; data-origin-height=&quot;1168&quot; width=&quot;500&quot; data-ke-mobilestyle=&quot;widthContent&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/cW2mzo/btq4s36Ecsy/Da0HA8UkSQkfQx70f2g8rk/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/cW2mzo/btq4s36Ecsy/Da0HA8UkSQkfQx70f2g8rk/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/cW2mzo/btq4s36Ecsy/Da0HA8UkSQkfQx70f2g8rk/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FcW2mzo%2Fbtq4s36Ecsy%2FDa0HA8UkSQkfQx70f2g8rk%2Fimg.png&quot; data-filename=&quot;스크린샷 2021-05-09 오후 4.53.41.png&quot; data-origin-width=&quot;972&quot; data-origin-height=&quot;1168&quot; width=&quot;500&quot; data-ke-mobilestyle=&quot;widthContent&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;span&gt;There's already a&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;&lt;b&gt;SparkSession&lt;/b&gt;&lt;span&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;called&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;&lt;b&gt;spark&lt;/b&gt;&lt;span&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;in your workspace,&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;&lt;b&gt;numpy&lt;/b&gt;&lt;span&gt;&lt;b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;/b&gt;has been imported as&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;&lt;b&gt;np&lt;/b&gt;&lt;span&gt;, and&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;&lt;b&gt;pandas&lt;/b&gt;&lt;span&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;as&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;&lt;b&gt;pd&lt;/b&gt;&lt;span&gt;.&lt;/span&gt;&lt;/p&gt;
&lt;p&gt;&amp;nbsp;&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;The code to create a&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;pandas&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;DataFrame of random numbers has already been provided and saved under&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;pd_temp&lt;/b&gt;.&lt;/li&gt;
&lt;li&gt;Create a Spark DataFrame called&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;spark_temp&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;/b&gt;by calling the&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;.createDataFrame()&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;method with&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;pd_temp&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;as the argument.&lt;/li&gt;
&lt;li&gt;Examine the list of tables in your Spark cluster and verify that the new DataFrame is&lt;span&gt;&amp;nbsp;&lt;/span&gt;not&lt;span&gt;&amp;nbsp;&lt;/span&gt;present. Remember you can use&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;spark.catalog.listTables()&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;to do so.&lt;/li&gt;
&lt;li&gt;Register&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;spark_temp&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;as a temporary table named&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;&quot;temp&quot;&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;using the&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;.createOrReplaceTempView()&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;method. Remember that the table name is set by passing it as the only argument!&lt;/li&gt;
&lt;li&gt;Examine the list of tables again!&lt;/li&gt;
&lt;/ul&gt;
&lt;pre id=&quot;code_1620547623120&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# Wrong Answer

# Create pd_temp
pd_temp = pd.DataFrame(np.random.random(10))

# Create spark_temp from pd_temp
spark_temp = pd_temp.createDataFrame()  # wrong: .createDataFrame() is a SparkSession method, not a pandas one

# Examine the tables in the catalog
print(spark_temp)  # wrong: this prints the DataFrame, not the catalog

# Add spark_temp to the catalog
spark_temp.spark.catalog.listTables()  # wrong: listTables() only lists tables; it registers nothing

# Examine the tables in the catalog again
print(spark_temp)&lt;/code&gt;&lt;/pre&gt;
&lt;pre id=&quot;code_1620547643575&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# Solution

# Create pd_temp
pd_temp = pd.DataFrame(np.random.random(10))

# Create spark_temp from pd_temp
spark_temp = spark.createDataFrame(pd_temp)

# Examine the tables in the catalog
print(spark.catalog.listTables())

# Add spark_temp to the catalog
spark_temp.createOrReplaceTempView(&quot;temp&quot;)

# Examine the tables in the catalog again
print(spark.catalog.listTables())&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&amp;nbsp;&lt;/p&gt;
&lt;h1&gt;Dropping the middle man&lt;/h1&gt;
&lt;p&gt;Now you know how to put data into Spark via&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;pandas&lt;/b&gt;, but you're probably wondering why deal with&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;pandas&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;at all? Wouldn't it be easier to just read a text file straight into Spark? Of course it would!&lt;/p&gt;
&lt;p&gt;Luckily, your&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;SparkSession&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;has a&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;.read&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;attribute which has several methods for reading different data sources into Spark DataFrames. Using these you can create a DataFrame from a .csv file just like with regular&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;pandas&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;DataFrames!&lt;/p&gt;
&lt;p&gt;The variable&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;file_path&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;is a string with the path to the file&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;airports.csv&lt;/b&gt;. This file contains information about different airports all over the world.&lt;/p&gt;
&lt;p&gt;A&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;SparkSession&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;named&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;spark&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;is available in your workspace.&lt;/p&gt;
&lt;p&gt;&amp;nbsp;&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Use the&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;.read.csv()&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;method to create a Spark DataFrame called&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;airports&lt;/b&gt;
&lt;ul&gt;
&lt;li&gt;The first argument is&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;file_path&lt;/b&gt;&lt;/li&gt;
&lt;li&gt;Pass the argument&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;header=True&lt;/b&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;so that Spark knows to take the column names from the first line of the file.&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;Print out this DataFrame by calling&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;.show()&lt;/b&gt;.&lt;/li&gt;
&lt;/ul&gt;
&lt;pre id=&quot;code_1620547986193&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# Don't change this file path
file_path = &quot;/usr/local/share/datasets/airports.csv&quot;

# Read in the airports data
airports = spark.read.csv(file_path, header=True)

# Show the data
airports.show()
&lt;/code&gt;&lt;/pre&gt;</description>
      <category>Spark(Scala and PySpark)</category>
      <author>riverruns</author>
      <guid isPermaLink="true">https://seoyoyo.tistory.com/4</guid>
      <comments>https://seoyoyo.tistory.com/4#entry4comment</comments>
      <pubDate>Sun, 9 May 2021 17:33:57 +0900</pubDate>
    </item>
    <item>
      <title>[SQL] SELECT DISTINCT | DISTINCT한 로우들을 COUNT한 것들의 COUNT</title>
      <link>https://seoyoyo.tistory.com/3</link>
      <description>&lt;p data-ke-size=&quot;size16&quot;&gt;groupBy를 하든 group by를 하든 이렇게 중복 없이 묶인 DISTINCT한 로우들을 COUNT 하는 방법&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1617979670240&quot; class=&quot;sql&quot; data-ke-language=&quot;sql&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;SELECT Count(*) AS 이름맘대로 
FROM (SELECT DISTINCT 컬럼명 FROM 테이블명) t  -- some databases require an alias on the subquery&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;For example, say the ramen stocked at my house is Yeol Ramen, Jin Ramen, and Shin Ramen, at 2, 3, and 3 packs each.&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;There are 8 packs of ramen in total, but only 3 kinds. This is how you count such kinds, classes, items, categories, and so on.&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1617980093066&quot; class=&quot;sql&quot; data-ke-language=&quot;sql&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;SELECT Count(*) 우리집이먹는라면종류
FROM (SELECT DISTINCT 라면이담긴컬럼 FROM 테이블명) t&lt;/code&gt;&lt;/pre&gt;
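&lt;p data-ke-size=&quot;size16&quot;&gt;For reference, a PySpark sketch of the same count (assuming a SparkSession named spark; the placeholder table and column names come from the query above):&lt;/p&gt;
&lt;pre class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;from pyspark.sql import functions as F

# Count the distinct kinds directly; no subquery needed
df = spark.table(&quot;테이블명&quot;)
df.select(F.countDistinct(&quot;라면이담긴컬럼&quot;).alias(&quot;우리집이먹는라면종류&quot;)).show()&lt;/code&gt;&lt;/pre&gt;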
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;</description>
      <category>SQL</category>
      <author>riverruns</author>
      <guid isPermaLink="true">https://seoyoyo.tistory.com/3</guid>
      <comments>https://seoyoyo.tistory.com/3#entry3comment</comments>
      <pubDate>Fri, 9 Apr 2021 23:56:39 +0900</pubDate>
    </item>
  </channel>
</rss>