A Simple Wikipedia Scraper in Python

Here is a simple scraper written in Python.
The technique used in this scraper is called corpus...


#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# dependencies BeautifulSoup

import os
import ssl
import re
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup


class WikiFruitCorpus:
    """Build a plain-text corpus from Simple English Wikipedia fruit articles.

    ``get_fruit_links`` collects article paths from the "List of fruits" page
    and ``create_corpus`` saves the paragraph text of each article as an
    ASCII-only ``.txt`` file inside a ``fruit_corpus_simplewiki`` directory
    located next to this script.
    """

    # URL of the page listing the fruit articles; assigned in __init__.
    _url = ""
    # HTTP request headers; replaced by a {"User-Agent": ...} dict in
    # set_headers (which __init__ always calls).
    __headers = ""
    # Directory containing this script, with a trailing "/".
    __dir_location = os.path.dirname(os.path.realpath(__file__)) + "/"

    # SECURITY NOTE: this replaces urllib's default HTTPS context factory with
    # an unverified one, i.e. certificate verification is switched off for the
    # whole process as soon as this class body is executed. It is kept only
    # because existing users may rely on it as a workaround for SSL errors;
    # fixing the local certificate store is the recommended solution.
    ssl._create_default_https_context = ssl._create_unverified_context

    def __init__(self, header="Mozilla/5.0"):
        """Point the scraper at the fruit list and set the User-Agent.

        Args:
            header: Value sent as the ``User-Agent`` request header.
        """
        self._url = "https://simple.wikipedia.org/wiki/List_of_fruits"
        self.set_headers(header)

    def set_headers(self, header):
        """Replace the request headers with a single ``User-Agent`` entry.

        Args:
            header: Value sent as the ``User-Agent`` request header.
        """
        self.__headers = {"User-Agent": header}

    def get_fruit_links(self):
        """Return the "/wiki/..." article paths found on the fruit list page.

        Returns:
            A list of ASCII-only relative paths (e.g. ``/wiki/Apple``).
        """
        req = Request(self._url, headers=self.__headers)
        # Close the HTTP response as soon as the document has been parsed.
        with urlopen(req) as page:
            soup = BeautifulSoup(page, "html.parser")
        hrefs = (anchor.get("href") for anchor in soup.find_all("a"))
        # str() keeps anchors without an href (None) from raising here.
        fruit_links = [
            self.__remove_non_ascii(href)
            for href in hrefs
            if str(href).startswith("/wiki/")
        ]
        # NOTE(review): the fixed 2..86 window presumably selects the fruit
        # entries for the page layout at the time of writing -- verify
        # against the live page before relying on it.
        return fruit_links[2:87]

    @staticmethod
    def __remove_non_ascii(text):
        """Return *text* with every character outside 7-bit ASCII removed."""
        return "".join(ch for ch in text if ord(ch) < 128)

    @staticmethod
    def _clean_text(text):
        """Normalise article text before it is written to the corpus.

        Drops non-ASCII characters, starts a new line after every ". " and
        then removes numeric citation markers such as ``[12]``.
        """
        ascii_text = WikiFruitCorpus.__remove_non_ascii(text)
        one_sentence_per_line = ascii_text.replace(". ", ".\n")
        # Raw string: "\[" and "\d" are regex escapes, not string escapes.
        return re.sub(r"\[\d+\]", "", one_sentence_per_line)

    def create_corpus(self, links_list):
        """Download each article in *links_list* and save its text to disk.

        One ``<article>.txt`` file per link is written into the
        ``fruit_corpus_simplewiki`` directory next to this script. Directory
        creation and file-write errors are printed and do not stop the run.

        Args:
            links_list: Iterable of relative article paths such as
                ``/wiki/Apple``, as returned by ``get_fruit_links``.
        """
        dir_name = "fruit_corpus_simplewiki"
        dir_loc = self.__dir_location + dir_name + "/"
        base_url = "https://simple.wikipedia.org"

        # Create the directory where the files are actually written (next to
        # the script), not relative to the current working directory.
        try:
            os.makedirs(dir_loc, exist_ok=True)
        except OSError as e:
            print(e)

        for link in links_list:
            fruit_url = base_url + link
            print("Working with ", fruit_url)
            # Send the same headers as get_fruit_links so the configured
            # User-Agent applies to every request.
            req = Request(fruit_url, headers=self.__headers)
            with urlopen(req) as page:
                soup = BeautifulSoup(page.read(), "html.parser")
            paragraphs = soup.find_all("p")
            main_content = self._clean_text(
                "\n".join(p.text for p in paragraphs)
            )
            # The last path segment of the link names the output file.
            filename = link.rsplit("/", 1)[-1]
            try:
                with open(dir_loc + filename + ".txt", "w",
                          encoding="utf-8") as f:
                    f.write(main_content)
            except IOError as e:
                print("Error in writing to file", e)
            else:
                # Report success only when the write really succeeded.
                print("File \"%s.txt\" saved in %s" % (filename, dir_loc))


def main():
    """Collect the fruit article links and write the corpus to disk."""
    # Adjacent string literals are joined verbatim, so the space separating
    # the platform token from "Gecko" must be part of the first literal.
    header = (
        "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.1.3) "
        "Gecko/20090913 Firefox/3.5.3"
    )
    corpus = WikiFruitCorpus()
    corpus.set_headers(header)
    fruit_list = corpus.get_fruit_links()
    corpus.create_corpus(fruit_list)


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

No comments :

Post a Comment