crawl

check_download(pdfids, save_path, logger=logger)

Check whether all pdfs have been downloaded and find the cases that failed.

Given a list of pdf ids, it checks the data directory under save_path and returns the pdf ids that failed to download.

Parameters:
  • pdfids (list) –

    List of pdf ids.

  • save_path (str) –

    The path to save data.

  • logger

    loguru logger.

Returns:
  • failed_ids

    The ids of failed pdfs.

Source code in src/racp/crawl.py
def check_download(
        pdfids : list,
        save_path : str,
        logger=logger
    ):
    '''Report which pdf ids have no downloaded file yet.

    Given a list of pdf ids, it lists the `data` directory under `save_path`, strips the
    extension of every entry and returns the pdf ids that have no matching entry. Nothing is
    downloaded here.

    Args:
        pdfids: List of pdf ids.
        save_path: The path to save data.
        logger: loguru logger.

    Returns:
        failed_ids: The ids of failed pdfs, de-duplicated, in the order they first appear in
            `pdfids`.
    '''
    data_dir = os.path.join(save_path, "data")
    try:
        entries = os.listdir(data_dir)
    except FileNotFoundError:
        # No data directory means nothing was downloaded, so every id counts as failed.
        logger.warning(f"{data_dir} does not exist, treat all pdfs as failed")
        entries = []
    existing = {os.path.splitext(entry)[0] for entry in entries}
    # dict.fromkeys de-duplicates like a set but keeps the input order, so the result is
    # reproducible between runs.
    failed_ids = [pdfid for pdfid in dict.fromkeys(pdfids) if pdfid not in existing]
    logger.info(f"{len(failed_ids)} pdfs are failed")
    return failed_ids

get_arxiv_data(arxiv_id, logger=logger)

Given arXiv id, this function gets pdf text data from arXiv.

Parameters:
  • arxiv_id (str) –

    The arXiv id of the paper you want.

  • logger

    loguru logger.

Returns:
  • text

    Raw text of the pdf extracted by PyMuPDF.

Source code in src/racp/crawl.py
def get_arxiv_data(
        arxiv_id : str,
        logger=logger
    ):
    '''Given arXiv id, this function gets pdf text data from arXiv.

    Downloads `https://arxiv.org/pdf/{arxiv_id}` and concatenates the text of every page.

    Args:
        arxiv_id: The arXiv id of the paper you want.
        logger: loguru logger.

    Returns:
        text: Raw text of the pdf extracted by PyMuPDF.

    Raises:
        ConnectionError: If the download or the text extraction fails. The original error is
            attached as the cause.
    '''
    try:
        document = requests.get(f"https://arxiv.org/pdf/{arxiv_id}",timeout=60)
        document.raise_for_status()
        pdf = fitz.open(stream=document.content, filetype="pdf")
        try:
            text = "".join(page.get_text() for page in pdf.pages())
        finally:
            # Release the document even when a page fails to extract.
            pdf.close()
    except Exception as err:
        # `Exception` (not a bare except) lets Ctrl-C / SystemExit stop a crawl.
        logger.error(f"Fail to download {arxiv_id}")
        raise ConnectionError(f"Fail to download {arxiv_id}") from err
    logger.debug(f"Successfully get {arxiv_id}")
    return text

get_author_info(author_ids, logger=logger, key='', count=0)

Get author data from Semantic Scholar.

Source code in src/racp/crawl.py
def get_author_info(
        author_ids,
        logger=logger,
        key="",
        count=0
):
    '''Get author data from Semantic Scholar.

    Posts `author_ids` to the author batch endpoint and asks for the `name`, `citationCount`
    and `paperCount` fields. A failed request is retried after 3 seconds.

    Args:
        author_ids: Ids sent as the `ids` field of the request body.
        logger: loguru logger.
        key: The Semantic Scholar api key, default to "".
        count: Number of attempts that have already failed. No retry is made once it
            reaches 3.

    Returns:
        The decoded json response.

    Raises:
        ConnectionError: If the request still fails after the last retry.
    '''
    headers = {"x-api-key": key}
    # Loop instead of recursing so that the result of a successful retry is returned to the
    # caller (the recursive call used to drop it and return None).
    while True:
        try:
            r = requests.post(
                'https://api.semanticscholar.org/graph/v1/author/batch',
                params={'fields': 'name,citationCount,paperCount'},
                json={"ids": author_ids},
                headers=headers,
                # Without a timeout a stalled connection would block forever.
                timeout=30
            )
            r.raise_for_status()
            return r.json()
        except Exception as err:
            # `Exception` (not a bare except) lets Ctrl-C abort instead of retrying.
            if count >= 3:
                logger.error(f"Failed to get {author_ids} for 3 times. Give up.")
                raise ConnectionError(f"Failed to get {author_ids}") from err
            logger.warning(f"Fail {count+1} time, try again in 3 secs")
            time.sleep(3)
            count += 1

get_citaions(ss_ids, key='')

Get citationCount from Semantic Scholar.

Parameters:
  • ss_ids (list) –

    A list of Semantic Scholar paper ids.

  • key

    Semantic Scholar API key.

Returns:
  • citationcount

    A dict.

Source code in src/racp/crawl.py
def get_citaions(
    ss_ids : list,
    key=""
):
    '''Get citationCount from Semantic Scholar.

    Posts `ss_ids` to the paper batch endpoint and asks for the `citationCount` field.

    Args:
        ss_ids: A list of Semantic Scholar paper ids.
        key: Semantic Scholar api key.

    Returns:
        citationcount: A dict that maps each returned `paperId` to its `citationCount`.
            Empty when `ss_ids` is empty.

    Raises:
        ConnectionError: If the request fails or the response cannot be parsed.
    '''
    if not ss_ids:
        # Nothing to look up, so skip the request.
        return {}
    try:
        r = requests.post(
            'https://api.semanticscholar.org/graph/v1/paper/batch',
            params={'fields': 'citationCount'},
            json={"ids": ss_ids},
            headers={"x-api-key": key},
            timeout=10
        )
        r.raise_for_status()
        # NOTE(review): the batch endpoint appears to return null for ids it cannot
        # resolve; skip those entries instead of failing the whole batch.
        return {
            item["paperId"]: item["citationCount"]
            for item in r.json()
            if item is not None
        }
    except Exception as err:
        raise ConnectionError("Failed to get citation counts from Semantic Scholar") from err

get_ids(years, fields, save_path, logger=logger, headers=None, timeout=30)

Get pdf arXiv ids from specified field and years.

Given years and fields, it crawls arXiv and gets all pdf ids that satisfy the requirements. For example, get_ids(3, ["cs.IR"], ..) returns all cs.IR papers in 2021-2023 and saves the data into the given path. The json file is named "targets.json". Network errors are logged and skipped. Note that all urls to crawl must be collected first, since arXiv shows only 25 papers per page by default. That url list is cached as "all_queries.json". If you want to crawl again, please delete the cache.

Parameters:
  • years (int) –

    The number of years to crawl, counting back from 2023.

  • fields (list) –

    A list of arXiv fields of the papers you want, like cs.IR, cs.CV.

  • save_path (str) –

    The path to save crawled data.

  • logger

    loguru logger.

  • headers

    Default to None.

  • timeout

    Default to 30.

Returns:
  • ids

    A List that contains all the pdf ids needed.

Source code in src/racp/crawl.py
def _expand_listing_url(url, headers, timeout):
    '''Return the paged urls (100 papers per page) that cover one arXiv listing url.'''
    res = requests.get(url, headers=headers, timeout=timeout)
    res.raise_for_status()
    bs = BeautifulSoup(res.text, features="xml")
    # The paper count is read from the 4th space-separated token of the first <small> tag.
    paper_num = int(bs.find_all("small")[0].text.split(" ")[3])
    return [url + f"?skip={100*i}&show=100" for i in range(paper_num//100+1)]


def get_ids(
        years : int, 
        fields : list, 
        save_path : str,
        logger=logger, 
        headers=None, 
        timeout=30
):
    '''Get pdf arXiv ids from specified field and years.

    Given years and fields, it crawls arXiv and gets all pdf ids that satisfy the requirements.
    For example, get_ids(3, ["cs.IR"], ..) returns all cs.IR papers in 2021-2023 and saves data
    into the given path. The json file is named "targets.json". Network errors are logged and
    skipped. All urls to crawl are collected first, one per page of 100 papers, and cached as
    "all_queries.json". Listing urls that fail are retried once. If you want to crawl again,
    please delete the cache.

    Args:
        years: The number of years to crawl, counting back from 2023.
        fields: A list of arXiv fields of the papers you want, like cs.IR, cs.CV.
        save_path: The path to save crawled data.
        logger: loguru logger.
        headers: Headers passed to every request. Default to None.
        timeout: Timeout passed to every request. Default to 30.

    Returns:
        ids: A list that contains all the pdf ids needed, without duplicates.
    '''

    times = ["{}{:02}".format(23-i,j) for i in range(years) for j in range(1,13)]
    base_url = "https://arxiv.org/list"
    cache_file = os.path.join(save_path, "all_queries.json")
    all_queries = []
    if os.path.exists(cache_file):
        with open(cache_file) as f:
            all_queries = json.load(f)
    else:
        first_queries = [
            "/".join([base_url, field, month]) for field in fields for month in times
        ]
        failed_cases = []
        logger.debug("Start to construct all urls to crawl")
        for url in tqdm(first_queries):
            try:
                all_queries += _expand_listing_url(url, headers, timeout)
                time.sleep(5)
            except Exception:
                # `Exception` (not a bare except) keeps the crawl interruptible with Ctrl-C.
                logger.error(f"Fail to get {url}")
                failed_cases.append(url)
        logger.debug("Trying again to get failed cases")
        for url in tqdm(failed_cases):
            try:
                all_queries += _expand_listing_url(url, headers, timeout)
                time.sleep(5)
            except Exception:
                logger.error(f"Fail to get {url}")
        save_json(all_queries, cache_file, logger, "all_queries.json")
    ids = []
    for url in tqdm(all_queries):
        try:
            res = requests.get(url, headers=headers, timeout=timeout)
            res.raise_for_status()
            bs = BeautifulSoup(res.text, features="xml")
            pdf_links = bs.find_all('a', title="Download PDF")
            # The id is the last path segment of every "Download PDF" link.
            ids.extend(link['href'].split("/")[-1] for link in pdf_links)
        except Exception:
            logger.error(f"Fail to get {url}")
    ids = list(set(ids))
    save_json(ids, os.path.join(save_path, "targets.json"), logger, "targets.json")
    logger.info(f"Get {len(ids)} pdf to crawl")
    return ids

get_ss_data_by_arxiv(arxiv_id, logger=logger, key='', count=0)

Get Semantic Scholar data given an arXiv id.

Parameters:
  • arxiv_id

    The arXiv id of the paper.

  • logger

    loguru logger.

  • key

    The Semantic Scholar API key, default to "".

  • count

    Records the number of tries.

Returns:
  • data

    A json dictionary from the Semantic Scholar API.

Source code in src/racp/crawl.py
def get_ss_data_by_arxiv(
        arxiv_id, 
        logger=logger, 
        key="",
        count=0
):
    '''Get Semantic Scholar data given arXiv id.

    Requests the paper `arXiv:{arxiv_id}` with its title, external ids, citations,
    publication types, authors, references, publication date and abstract. A failed request
    is retried after 3 seconds.

    Args:
        arxiv_id: The arXiv id of the paper.
        logger: loguru logger.
        key: The Semantic Scholar api key, default to "".
        count: Number of attempts that have already failed. No retry is made once it
            reaches 3.

    Returns:
        data: A json dictionary from the Semantic Scholar api.

    Raises:
        ConnectionError: If the request still fails after the last retry.
    '''
    headers = {"x-api-key": key}
    while True:
        try:
            r = requests.get(
                f'https://api.semanticscholar.org/graph/v1/paper/arXiv:{arxiv_id}',
                params={'fields': 'title,externalIds,citations,publicationTypes,authors,references,publicationDate,abstract',},
                headers=headers,
                # Without a timeout a stalled connection would block forever.
                timeout=30
            )
            r.raise_for_status()
            return r.json()
        except Exception as err:
            # `Exception` (not a bare except) lets Ctrl-C abort instead of retrying.
            if count >= 3:
                logger.error(f"Failed to get {arxiv_id} for 3 times. Give up.")
                raise ConnectionError(f"Failed to get {arxiv_id}") from err
            logger.warning(f"Fail {count+1} time, try again in 3 secs")
            time.sleep(3)
            count += 1

get_ss_data_by_ss(ss_id, logger=logger, key='', count=0)

Get Semantic Scholar data given a Semantic Scholar paper id.

Parameters:
  • ss_id

    The semantics scholar paper id.

  • logger

    loguru logger.

  • key

    The Semantic Scholar API key, default to "".

  • count

    Records the number of tries.

Returns:
  • data

    A json dictionary from the Semantic Scholar API.

Source code in src/racp/crawl.py
def get_ss_data_by_ss(
        ss_id, 
        logger=logger, 
        key = "",
        count = 0
):
    '''Get Semantic Scholar data given a Semantic Scholar paper id.

    Requests the paper `ss_id` with its title, external ids, citations, publication types,
    authors, references, publication date and abstract. A failed request is retried after
    3 seconds.

    Args:
        ss_id: The Semantic Scholar paper id.
        logger: loguru logger.
        key: The Semantic Scholar api key, default to "".
        count: Number of attempts that have already failed. No retry is made once it
            reaches 3.

    Returns:
        data: A json dictionary from the Semantic Scholar api.

    Raises:
        ConnectionError: If the request still fails after the last retry.
    '''
    headers = {"x-api-key": key}
    while True:
        try:
            r = requests.get(
                f'https://api.semanticscholar.org/graph/v1/paper/{ss_id}',
                params={'fields': 'title,externalIds,citations,publicationTypes,authors,references,publicationDate,abstract',},
                headers=headers,
                # Without a timeout a stalled connection would block forever.
                timeout=30
            )
            r.raise_for_status()
            return r.json()
        except Exception as err:
            # `Exception` (not a bare except) lets Ctrl-C abort instead of retrying.
            if count >= 3:
                logger.error(f"Failed to get {ss_id} for 3 times. Give up.")
                raise ConnectionError(f"Failed to get {ss_id}") from err
            logger.warning(f"Fail {count+1} time, try again in 3 secs")
            time.sleep(3)
            count += 1